2 * Loongson SIMD optimized vp8dsp
4 * Copyright (c) 2016 Loongson Technology Corporation Limited
5 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "vp8dsp_mips.h"
25 #include "constants.h"
26 #include "libavutil/attributes.h"
27 #include "libavutil/mips/mmiutils.h"
28 #include "libavutil/mem_internal.h"
/* Scratch operands shared by the inline-asm macros below: the DECLARE_*
 * macros declare local temporaries, and the matching RESTRICT_ASM_* macros
 * list them in the asm output-constraint list ("=&f"/"=&r", early-clobber)
 * so the compiler treats them as clobbered by the asm block. */
30 #define DECLARE_DOUBLE_1 double db_1
31 #define DECLARE_DOUBLE_2 double db_2
32 #define DECLARE_UINT32_T uint32_t it_1
33 #define RESTRICT_ASM_DOUBLE_1 [db_1]"=&f"(db_1)
34 #define RESTRICT_ASM_DOUBLE_2 [db_2]"=&f"(db_2)
35 #define RESTRICT_ASM_UINT32_T [it_1]"=&r"(it_1)
/* Per-byte unsigned "greater than": dst = (src1 > src2) ? 0xff : 0x00.
 * MMI has no unsigned byte compare, so it is built from pmaxub/pcmpeqb:
 *   db_1 = (src1 == src2); db_2 = (max(src1, src2) == src1) i.e. src1 >= src2;
 *   dst  = db_2 ^ db_1    ->  (src1 >= src2) && !(src1 == src2).
 * Clobbers %[db_1] and %[db_2]. */
37 #define MMI_PCMPGTUB(dst, src1, src2) \
38 "pcmpeqb %[db_1], "#src1", "#src2" \n\t" \
39 "pmaxub %[db_2], "#src1", "#src2" \n\t" \
40 "pcmpeqb %[db_2], %[db_2], "#src1" \n\t" \
41 "xor "#dst", %[db_2], %[db_1] \n\t"
/* Sign-extend the eight packed bytes in src into two halfword vectors:
 * dst_r gets the low four bytes, dst_l the high four. The sign halves are
 * produced by comparing src against zero (pcmpgtb yields 0xff for negative
 * bytes) and interleaving. Clobbers %[db_1] and %[db_2]. */
43 #define MMI_BTOH(dst_l, dst_r, src) \
44 "xor %[db_1], %[db_1], %[db_1] \n\t" \
45 "pcmpgtb %[db_2], %[db_1], "#src" \n\t" \
46 "punpcklbh "#dst_r", "#src", %[db_2] \n\t" \
47 "punpckhbh "#dst_l", "#src", %[db_2] \n\t"
/* VP8 normal loop-filter core on 8 pixels per register, operating on
 * %[p3]..%[q3] in place.  Steps:
 *   1. hev  = |p1-p0| or |q1-q0| above %[thresh] (byte-splatted);
 *   2. mask = edge activity test against %[e] and interior test against %[i];
 *   3. bias all pixels by 0x80 (signed domain), compute the common filter
 *      value clip(3*(q0-p0) + clip(p1-q1)) via 16-bit halves (MMI_BTOH);
 *   4. hev path: adjust p0/q0 with (f+4)>>3 and (f+3)>>3 (PSRAB_MMI);
 *   5. !hev path: adjust p0/q0, p1/q1, p2/q2 with the weighted terms
 *      (27*w+63)>>7, (18*w+63)>>7 and (9*w+63)>>7 (9*w built as w<<3 + w).
 * Inputs: %[e], %[i], %[thresh] (general registers), constants via %[tmp0].
 * Clobbers %[ftmp0]..%[ftmp7], %[hev], %[mask], %[tmp0], %[db_1], %[db_2]. */
49 #define MMI_VP8_LOOP_FILTER \
50 /* Calculation of hev */ \
51 "dmtc1 %[thresh], %[ftmp3] \n\t" \
52 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
53 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
54 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
55 "pasubub %[ftmp0], %[p1], %[p0] \n\t" \
56 "pasubub %[ftmp1], %[q1], %[q0] \n\t" \
57 "pmaxub %[ftmp0], %[ftmp0], %[ftmp1] \n\t" \
58 MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3]) \
59 /* Calculation of mask */ \
60 "pasubub %[ftmp1], %[p0], %[q0] \n\t" \
61 "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
62 "pasubub %[ftmp2], %[p1], %[q1] \n\t" \
63 "li %[tmp0], 0x09 \n\t" \
64 "dmtc1 %[tmp0], %[ftmp3] \n\t" \
65 PSRLB_MMI(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], %[ftmp2]) \
66 "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
67 "dmtc1 %[e], %[ftmp3] \n\t" \
68 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
69 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
70 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
71 MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3]) \
72 "pmaxub %[mask], %[mask], %[ftmp0] \n\t" \
73 "pasubub %[ftmp1], %[p3], %[p2] \n\t" \
74 "pasubub %[ftmp2], %[p2], %[p1] \n\t" \
75 "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
76 "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
77 "pasubub %[ftmp1], %[q3], %[q2] \n\t" \
78 "pasubub %[ftmp2], %[q2], %[q1] \n\t" \
79 "pmaxub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \
80 "pmaxub %[mask], %[mask], %[ftmp1] \n\t" \
81 "dmtc1 %[i], %[ftmp3] \n\t" \
82 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
83 "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
84 "punpcklwd %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
85 MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3]) \
86 "pcmpeqw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
87 "xor %[mask], %[mask], %[ftmp3] \n\t" \
89 "li %[tmp0], 0x80808080 \n\t" \
90 "dmtc1 %[tmp0], %[ftmp7] \n\t" \
91 "punpcklwd %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
92 "xor %[p2], %[p2], %[ftmp7] \n\t" \
93 "xor %[p1], %[p1], %[ftmp7] \n\t" \
94 "xor %[p0], %[p0], %[ftmp7] \n\t" \
95 "xor %[q0], %[q0], %[ftmp7] \n\t" \
96 "xor %[q1], %[q1], %[ftmp7] \n\t" \
97 "xor %[q2], %[q2], %[ftmp7] \n\t" \
98 "psubsb %[ftmp4], %[p1], %[q1] \n\t" \
99 "psubb %[ftmp5], %[q0], %[p0] \n\t" \
100 MMI_BTOH(%[ftmp1], %[ftmp0], %[ftmp5]) \
101 MMI_BTOH(%[ftmp3], %[ftmp2], %[ftmp4]) \
103 "paddh %[ftmp5], %[ftmp0], %[ftmp0] \n\t" \
104 "paddh %[ftmp0], %[ftmp0], %[ftmp5] \n\t" \
105 "paddh %[ftmp0], %[ftmp2], %[ftmp0] \n\t" \
107 "paddh %[ftmp5], %[ftmp1], %[ftmp1] \n\t" \
108 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" \
109 "paddh %[ftmp1], %[ftmp3], %[ftmp1] \n\t" \
110 /* Combine left and right part */ \
111 "packsshb %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
112 "and %[ftmp1], %[ftmp1], %[mask] \n\t" \
113 "and %[ftmp2], %[ftmp1], %[hev] \n\t" \
114 "li %[tmp0], 0x04040404 \n\t" \
115 "dmtc1 %[tmp0], %[ftmp0] \n\t" \
116 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
117 "paddsb %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \
118 "li %[tmp0], 0x0B \n\t" \
119 "dmtc1 %[tmp0], %[ftmp4] \n\t" \
120 PSRAB_MMI(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], %[ftmp3]) \
121 "li %[tmp0], 0x03030303 \n\t" \
122 "dmtc1 %[tmp0], %[ftmp0] \n\t" \
123 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
124 "paddsb %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \
125 "li %[tmp0], 0x0B \n\t" \
126 "dmtc1 %[tmp0], %[ftmp2] \n\t" \
127 PSRAB_MMI(%[ftmp4], %[ftmp2], %[ftmp5], %[ftmp6], %[ftmp4]) \
128 "psubsb %[q0], %[q0], %[ftmp3] \n\t" \
129 "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
130 /* filt_val &= ~hev */ \
131 "pcmpeqw %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
132 "xor %[hev], %[hev], %[ftmp0] \n\t" \
133 "and %[ftmp1], %[ftmp1], %[hev] \n\t" \
134 MMI_BTOH(%[ftmp5], %[ftmp6], %[ftmp1]) \
135 "li %[tmp0], 0x07 \n\t" \
136 "dmtc1 %[tmp0], %[ftmp2] \n\t" \
137 "li %[tmp0], 0x001b001b \n\t" \
138 "dmtc1 %[tmp0], %[ftmp1] \n\t" \
139 "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
140 "li %[tmp0], 0x003f003f \n\t" \
141 "dmtc1 %[tmp0], %[ftmp0] \n\t" \
142 "punpcklwd %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
144 "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
145 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
146 "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
148 "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
149 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
150 "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
151 /* Combine left and right part */ \
152 "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
153 "psubsb %[q0], %[q0], %[ftmp4] \n\t" \
154 "xor %[q0], %[q0], %[ftmp7] \n\t" \
155 "paddsb %[p0], %[p0], %[ftmp4] \n\t" \
156 "xor %[p0], %[p0], %[ftmp7] \n\t" \
157 "li %[tmp0], 0x00120012 \n\t" \
158 "dmtc1 %[tmp0], %[ftmp1] \n\t" \
159 "punpcklwd %[ftmp1], %[ftmp1], %[ftmp1] \n\t" \
161 "pmullh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
162 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
163 "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
165 "pmullh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
166 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
167 "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
168 /* Combine left and right part */ \
169 "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
170 "psubsb %[q1], %[q1], %[ftmp4] \n\t" \
171 "xor %[q1], %[q1], %[ftmp7] \n\t" \
172 "paddsb %[p1], %[p1], %[ftmp4] \n\t" \
173 "xor %[p1], %[p1], %[ftmp7] \n\t" \
174 "li %[tmp0], 0x03 \n\t" \
175 "dmtc1 %[tmp0], %[ftmp1] \n\t" \
177 "psllh %[ftmp3], %[ftmp6], %[ftmp1] \n\t" \
178 "paddh %[ftmp3], %[ftmp3], %[ftmp6] \n\t" \
179 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
180 "psrah %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
182 "psllh %[ftmp4], %[ftmp5], %[ftmp1] \n\t" \
183 "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
184 "paddh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
185 "psrah %[ftmp4], %[ftmp4], %[ftmp2] \n\t" \
186 /* Combine left and right part */ \
187 "packsshb %[ftmp4], %[ftmp3], %[ftmp4] \n\t" \
188 "psubsb %[q2], %[q2], %[ftmp4] \n\t" \
189 "xor %[q2], %[q2], %[ftmp7] \n\t" \
190 "paddsb %[p2], %[p2], %[ftmp4] \n\t" \
191 "xor %[p2], %[p2], %[ftmp7] \n\t"
/* 6-tap horizontal subpel filter on 4 pixels: loads src[-2..+3], multiplies
 * each by the splatted %[filter0..filter5] taps (signs + - + + - +, matching
 * FILTER_6TAP below), adds the ff_pw_64 rounding bias, arithmetic-shifts by
 * %[ftmp4] (the caller loads 7 there) and stores 4 packed bytes at dst.
 * Clobbers %[ftmp1]..%[ftmp3], %[ftmp5]; %[ftmp0] must hold zero. */
193 #define PUT_VP8_EPEL4_H6_MMI(src, dst) \
194 MMI_ULWC1(%[ftmp1], src, 0x00) \
195 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
196 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
198 MMI_ULWC1(%[ftmp1], src, -0x01) \
199 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
200 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
201 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
203 MMI_ULWC1(%[ftmp1], src, -0x02) \
204 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
205 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
206 "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
208 MMI_ULWC1(%[ftmp1], src, 0x01) \
209 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
210 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
212 MMI_ULWC1(%[ftmp1], src, 0x02) \
213 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
214 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
215 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
217 MMI_ULWC1(%[ftmp1], src, 0x03) \
218 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
219 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
220 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
222 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
223 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
224 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
225 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
227 MMI_SWC1(%[ftmp1], dst, 0x00)
/* 4-tap horizontal subpel filter on 4 pixels: taps at src[-1..+2] weighted
 * by %[filter1..filter4] (signs - + + -, matching FILTER_4TAP), rounded with
 * ff_pw_64 and shifted by %[ftmp4]; stores 4 packed bytes at dst.
 * NOTE(review): the filter4 term uses wrapping "psubh" while the sibling
 * EPEL macros use saturating "psubsh" — verify this is intentional. */
230 #define PUT_VP8_EPEL4_H4_MMI(src, dst) \
231 MMI_ULWC1(%[ftmp1], src, 0x00) \
232 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
233 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
235 MMI_ULWC1(%[ftmp1], src, -0x01) \
236 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
237 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
238 "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
240 MMI_ULWC1(%[ftmp1], src, 0x01) \
241 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
242 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
244 MMI_ULWC1(%[ftmp1], src, 0x02) \
245 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
246 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
247 "psubh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
249 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
251 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
252 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
254 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
255 MMI_SWC1(%[ftmp1], dst, 0x00)
/* 6-tap vertical subpel filter on 4 pixels: same weighting scheme as
 * PUT_VP8_EPEL4_H6_MMI but the six taps come from rows src-2*stride ..
 * src+3*stride, addressed through the scratch pointer register src1.
 * Rounds with ff_pw_64, shifts by %[ftmp4], stores 4 bytes at dst. */
258 #define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride) \
259 MMI_ULWC1(%[ftmp1], src, 0x00) \
260 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
261 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
263 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
264 MMI_ULWC1(%[ftmp1], src1, 0x00) \
265 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
266 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
267 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
269 PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
270 MMI_ULWC1(%[ftmp1], src1, 0x00) \
271 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
272 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
273 "paddsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
275 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
276 MMI_ULWC1(%[ftmp1], src1, 0x00) \
277 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
278 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
280 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
281 MMI_ULWC1(%[ftmp1], src1, 0x00) \
282 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
283 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
284 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
286 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
287 MMI_ULWC1(%[ftmp1], src1, 0x00) \
288 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
289 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
290 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
292 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
294 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
295 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
296 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
298 MMI_SWC1(%[ftmp1], dst, 0x00)
/* 4-tap vertical subpel filter on 4 pixels: taps from rows src-stride ..
 * src+2*stride via the scratch pointer src1, weights filter1..filter4
 * (signs - + + -). Rounds with ff_pw_64, shifts by %[ftmp4], stores 4
 * bytes at dst. */
301 #define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride) \
302 MMI_ULWC1(%[ftmp1], src, 0x00) \
303 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
304 "pmullh %[ftmp3], %[ftmp2], %[filter2] \n\t" \
306 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
307 MMI_ULWC1(%[ftmp1], src1, 0x00) \
308 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
309 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
310 "psubsh %[ftmp5], %[ftmp3], %[ftmp2] \n\t" \
312 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
313 MMI_ULWC1(%[ftmp1], src1, 0x00) \
314 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
315 "pmullh %[ftmp3], %[ftmp2], %[filter3] \n\t" \
317 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
318 MMI_ULWC1(%[ftmp1], src1, 0x00) \
319 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
320 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
321 "psubsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
323 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
325 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_64] \n\t" \
326 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
327 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
329 MMI_SWC1(%[ftmp1], dst, 0x00)
/* 8-pixel-wide variant of PUT_VP8_EPEL4_H6_MMI: each 8-byte load is split
 * into low/high halfword vectors (punpcklbh/punpckhbh) and the two halves
 * are filtered in parallel (%[ftmp5]/%[ftmp6] accumulators, %[ftmp7]/%[ftmp8]
 * hold the filter0 partial sums), then repacked and stored as 8 bytes. */
332 #define PUT_VP8_EPEL8_H6_MMI(src, dst) \
333 MMI_ULDC1(%[ftmp1], src, 0x00) \
334 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
335 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
336 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
337 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
339 MMI_ULDC1(%[ftmp1], src, -0x01) \
340 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
341 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
342 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
343 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
344 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
345 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
347 MMI_ULDC1(%[ftmp1], src, -0x02) \
348 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
349 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
350 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
351 "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
352 "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
353 "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
355 MMI_ULDC1(%[ftmp1], src, 0x01) \
356 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
357 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
358 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
359 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
361 MMI_ULDC1(%[ftmp1], src, 0x02) \
362 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
363 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
364 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
365 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
366 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
367 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
369 MMI_ULDC1(%[ftmp1], src, 0x03) \
370 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
371 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
372 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
373 "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
374 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
375 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
377 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
378 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
380 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
381 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
382 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
383 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
384 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
386 MMI_SDC1(%[ftmp1], dst, 0x00)
/* 8-pixel-wide 4-tap horizontal subpel filter: taps at src[-1..+2], weights
 * filter1..filter4 (signs - + + -), low/high halves filtered in parallel,
 * rounded with ff_pw_64 and shifted by %[ftmp4]; stores 8 bytes at dst. */
389 #define PUT_VP8_EPEL8_H4_MMI(src, dst) \
390 MMI_ULDC1(%[ftmp1], src, 0x00) \
391 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
392 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
393 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
394 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
396 MMI_ULDC1(%[ftmp1], src, -0x01) \
397 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
398 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
399 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
400 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
401 "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
402 "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
404 MMI_ULDC1(%[ftmp1], src, 0x01) \
405 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
406 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
407 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
408 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
410 MMI_ULDC1(%[ftmp1], src, 0x02) \
411 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
412 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
413 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
414 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
415 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
416 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
418 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
419 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
421 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
422 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
423 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
424 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
426 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
427 MMI_SDC1(%[ftmp1], dst, 0x00)
/* 8-pixel-wide 6-tap vertical subpel filter: taps from rows src-2*stride ..
 * src+3*stride via scratch pointer src1, low/high byte halves filtered in
 * parallel, rounded with ff_pw_64 and shifted by %[ftmp4]; stores 8 bytes
 * at dst. */
430 #define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride) \
431 MMI_ULDC1(%[ftmp1], src, 0x00) \
432 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
433 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
434 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
435 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
437 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
438 MMI_ULDC1(%[ftmp1], src1, 0x00) \
439 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
440 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
441 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
442 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
443 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
444 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
446 PTR_SUBU ""#src1", "#src1", "#srcstride" \n\t" \
447 MMI_ULDC1(%[ftmp1], src1, 0x00) \
448 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
449 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
450 "pmullh %[ftmp2], %[ftmp2], %[filter0] \n\t" \
451 "pmullh %[ftmp3], %[ftmp3], %[filter0] \n\t" \
452 "paddsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
453 "paddsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
455 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
456 MMI_ULDC1(%[ftmp1], src1, 0x00) \
457 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
458 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
459 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
460 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
462 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
463 MMI_ULDC1(%[ftmp1], src1, 0x00) \
464 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
465 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
466 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
467 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
468 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
469 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
471 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
472 MMI_ULDC1(%[ftmp1], src1, 0x00) \
473 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
474 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
475 "pmullh %[ftmp2], %[ftmp2], %[filter5] \n\t" \
476 "pmullh %[ftmp3], %[ftmp3], %[filter5] \n\t" \
477 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
478 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
480 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
481 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
483 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
484 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
485 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
486 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
487 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
489 MMI_SDC1(%[ftmp1], dst, 0x00)
/* 8-pixel-wide 4-tap vertical subpel filter: taps from rows src-stride ..
 * src+2*stride via scratch pointer src1, weights filter1..filter4 (signs
 * - + + -), rounded with ff_pw_64 and shifted by %[ftmp4]; stores 8 bytes
 * at dst. */
492 #define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride) \
493 MMI_ULDC1(%[ftmp1], src, 0x00) \
494 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
495 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
496 "pmullh %[ftmp5], %[ftmp2], %[filter2] \n\t" \
497 "pmullh %[ftmp6], %[ftmp3], %[filter2] \n\t" \
499 PTR_SUBU ""#src1", "#src", "#srcstride" \n\t" \
500 MMI_ULDC1(%[ftmp1], src1, 0x00) \
501 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
502 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
503 "pmullh %[ftmp2], %[ftmp2], %[filter1] \n\t" \
504 "pmullh %[ftmp3], %[ftmp3], %[filter1] \n\t" \
505 "psubsh %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
506 "psubsh %[ftmp8], %[ftmp6], %[ftmp3] \n\t" \
508 PTR_ADDU ""#src1", "#src", "#srcstride" \n\t" \
509 MMI_ULDC1(%[ftmp1], src1, 0x00) \
510 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
511 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
512 "pmullh %[ftmp5], %[ftmp2], %[filter3] \n\t" \
513 "pmullh %[ftmp6], %[ftmp3], %[filter3] \n\t" \
515 PTR_ADDU ""#src1", "#src1", "#srcstride" \n\t" \
516 MMI_ULDC1(%[ftmp1], src1, 0x00) \
517 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
518 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
519 "pmullh %[ftmp2], %[ftmp2], %[filter4] \n\t" \
520 "pmullh %[ftmp3], %[ftmp3], %[filter4] \n\t" \
521 "psubsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
522 "psubsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
524 "paddsh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \
525 "paddsh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
527 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_64] \n\t" \
528 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_64] \n\t" \
529 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
530 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
531 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
533 MMI_SDC1(%[ftmp1], dst, 0x00)
/* Horizontal bilinear interpolation on 8 pixels:
 * dst = (a*src[x] + b*src[x+1] + 4) >> ftmp4 (caller loads 3 in ftmp4),
 * low/high byte halves processed in parallel; stores 8 bytes at dst. */
536 #define PUT_VP8_BILINEAR8_H_MMI(src, dst) \
537 MMI_ULDC1(%[ftmp1], src, 0x00) \
538 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
539 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
540 "pmullh %[ftmp5], %[ftmp2], %[a] \n\t" \
541 "pmullh %[ftmp6], %[ftmp3], %[a] \n\t" \
543 MMI_ULDC1(%[ftmp1], src, 0x01) \
544 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
545 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
546 "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
547 "pmullh %[ftmp3], %[ftmp3], %[b] \n\t" \
548 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
549 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
551 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
552 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
553 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
554 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
556 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
557 MMI_SDC1(%[ftmp1], dst, 0x00)
/* Horizontal bilinear interpolation on 4 pixels:
 * dst = (a*src[x] + b*src[x+1] + 4) >> ftmp4; stores 4 bytes at dst. */
560 #define PUT_VP8_BILINEAR4_H_MMI(src, dst) \
561 MMI_ULWC1(%[ftmp1], src, 0x00) \
562 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
563 "pmullh %[ftmp3], %[ftmp2], %[a] \n\t" \
565 MMI_ULWC1(%[ftmp1], src, 0x01) \
566 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
567 "pmullh %[ftmp2], %[ftmp2], %[b] \n\t" \
568 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
570 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
571 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
573 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
574 MMI_SWC1(%[ftmp1], dst, 0x00)
/* Vertical bilinear interpolation on 8 pixels:
 * dst = (c*src[y] + d*src[y+stride] + 4) >> ftmp4, second row addressed via
 * the scratch pointer src1; stores 8 bytes at dst. */
577 #define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride) \
578 MMI_ULDC1(%[ftmp1], src, 0x00) \
579 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
580 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
581 "pmullh %[ftmp5], %[ftmp2], %[c] \n\t" \
582 "pmullh %[ftmp6], %[ftmp3], %[c] \n\t" \
584 PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
585 MMI_ULDC1(%[ftmp1], src1, 0x00) \
586 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
587 "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \
588 "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
589 "pmullh %[ftmp3], %[ftmp3], %[d] \n\t" \
590 "paddsh %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \
591 "paddsh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
593 "paddsh %[ftmp5], %[ftmp5], %[ff_pw_4] \n\t" \
594 "paddsh %[ftmp6], %[ftmp6], %[ff_pw_4] \n\t" \
595 "psrah %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
596 "psrah %[ftmp6], %[ftmp6], %[ftmp4] \n\t" \
598 "packushb %[ftmp1], %[ftmp5], %[ftmp6] \n\t" \
599 MMI_SDC1(%[ftmp1], dst, 0x00)
/* Vertical bilinear interpolation on 4 pixels:
 * dst = (c*src[y] + d*src[y+stride] + 4) >> ftmp4; stores 4 bytes at dst. */
602 #define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride) \
603 MMI_ULWC1(%[ftmp1], src, 0x00) \
604 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
605 "pmullh %[ftmp3], %[ftmp2], %[c] \n\t" \
607 PTR_ADDU ""#src1", "#src", "#sstride" \n\t" \
608 MMI_ULWC1(%[ftmp1], src1, 0x00) \
609 "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \
610 "pmullh %[ftmp2], %[ftmp2], %[d] \n\t" \
611 "paddsh %[ftmp3], %[ftmp3], %[ftmp2] \n\t" \
613 "paddsh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t" \
614 "psrah %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
616 "packushb %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \
617 MMI_SWC1(%[ftmp1], dst, 0x00)
/* Subpel filter taps pre-splatted to four 16-bit lanes per 64-bit word so
 * they can feed pmullh directly; row k holds the six taps for subpel
 * position k+1 (matches the scalar subpel_filters table below).
 * NOTE(review): the array's closing "};" is not visible in this excerpt. */
620 DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
621 {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
622 0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},
624 {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
625 0x0024002400240024, 0x0008000800080008, 0x0001000100010001},
627 {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
628 0x0032003200320032, 0x0006000600060006, 0x0000000000000000},
630 {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
631 0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},
633 {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
634 0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},
636 {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
637 0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},
639 {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
640 0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
/* Scalar reference 6-tap subpel filter: weighted sum of src[x-2..x+3] with
 * rounding (+64, >>7), clamped through the crop table. Uses the free
 * variables 'x' and 'cm' from the caller's scope and evaluates its
 * arguments multiple times — pass side-effect-free expressions only. */
644 #define FILTER_6TAP(src, F, stride) \
645 cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
646 F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] - \
647 F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]
/* Scalar reference 4-tap subpel filter: like FILTER_6TAP but with the two
 * outermost taps (F[0], F[5]) omitted; same free-variable and
 * multiple-evaluation caveats apply. */
649 #define FILTER_4TAP(src, F, stride) \
650 cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] + \
651 F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]
/* Scalar subpel filter taps (magnitudes only; FILTER_4TAP/FILTER_6TAP apply
 * the signs) for subpel positions 1..7; row k mirrors row k of
 * fourtap_subpel_filters above.
 * NOTE(review): the array's closing "};" is not visible in this excerpt. */
653 static const uint8_t subpel_filters[7][6] = {
654 { 0, 6, 123, 12, 1, 0 },
655 { 2, 11, 108, 36, 8, 1 },
656 { 0, 9, 93, 50, 6, 0 },
657 { 3, 16, 77, 77, 16, 3 },
658 { 0, 6, 50, 93, 9, 0 },
659 { 1, 8, 36, 108, 11, 2 },
660 { 0, 1, 12, 123, 6, 0 },
/* Q16 fixed-point multipliers used by the VP8 inverse transform (RFC 6386):
 * MUL_20091(a) ~= a * 1.30656..., MUL_35468(a) ~= a * 0.54119... */
663 #define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
664 #define MUL_35468(a) (((a) * 35468) >> 16)
/* Clamp n to [-128, 127] via the crop table; requires a local 'cm' pointing
 * at ff_crop_tab + MAX_NEG_CROP in the caller's scope. */
667 #define clip_int8(n) (cm[(n) + 0x80] - 0x80)
/* Common loop-filter core, 4-tap (hev) variant: computes the filter value
 * from p1..q1 and adjusts only p0 and q0 in place.
 * NOTE(review): this excerpt is missing lines (the stride parameter line,
 * opening brace, and the declarations/initialization of 'a', 'f1', 'f2'). */
668 static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
671 int av_unused p1 = p[-2 * stride];
672 int av_unused p0 = p[-1 * stride];
673 int av_unused q0 = p[ 0 * stride];
674 int av_unused q1 = p[ 1 * stride];
676 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
679 a += clip_int8(p1 - q1);
682 // We deviate from the spec here with c(a+3) >> 3
683 // since that's what libvpx does.
684 f1 = FFMIN(a + 4, 127) >> 3;
685 f2 = FFMIN(a + 3, 127) >> 3;
687 // Despite what the spec says, we do need to clamp here to
688 // be bitexact with libvpx.
689 p[-1 * stride] = cm[p0 + f2];
690 p[ 0 * stride] = cm[q0 - f1];
/* Common loop-filter core, non-hev variant: like vp8_filter_common_is4tap
 * but additionally adjusts p1 and q1 with the extra term 'a'.
 * NOTE(review): excerpt is missing lines (stride parameter, opening brace,
 * declarations of 'a'/'f1'/'f2' and the computation of 'a' from f1). */
693 static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
696 int av_unused p1 = p[-2 * stride];
697 int av_unused p0 = p[-1 * stride];
698 int av_unused q0 = p[ 0 * stride];
699 int av_unused q1 = p[ 1 * stride];
701 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
706 // We deviate from the spec here with c(a+3) >> 3
707 // since that's what libvpx does.
708 f1 = FFMIN(a + 4, 127) >> 3;
709 f2 = FFMIN(a + 3, 127) >> 3;
711 // Despite what the spec says, we do need to clamp here to
712 // be bitexact with libvpx.
713 p[-1 * stride] = cm[p0 + f2];
714 p[ 0 * stride] = cm[q0 - f1];
716 p[-2 * stride] = cm[p1 + a];
717 p[ 1 * stride] = cm[q1 - a];
/* Simple-filter edge test: nonzero when 2*|p0-q0| + |p1-q1|/2 <= flim.
 * NOTE(review): excerpt is missing the flim parameter line and opening
 * brace. */
720 static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
723 int av_unused p1 = p[-2 * stride];
724 int av_unused p0 = p[-1 * stride];
725 int av_unused q0 = p[ 0 * stride];
726 int av_unused q1 = p[ 1 * stride];
728 return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
/* High-edge-variance test: nonzero when either inner-pixel difference
 * (|p1-p0| or |q1-q0|) exceeds thresh.
 * NOTE(review): the opening brace line is missing from this excerpt. */
731 static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
733 int av_unused p1 = p[-2 * stride];
734 int av_unused p0 = p[-1 * stride];
735 int av_unused q0 = p[ 0 * stride];
736 int av_unused q1 = p[ 1 * stride];
738 return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
/* Macroblock-edge filter: computes w = clip(clip(p1-q1) + 3*(q0-p0)) and
 * adjusts p2..q2 with the weights (27w+63)>>7, (18w+63)>>7, (9w+63)>>7 —
 * the same constants the MMI_VP8_LOOP_FILTER asm uses (0x1b/0x12/9, bias
 * 0x3f, shift 7).
 * NOTE(review): excerpt is missing the opening brace and the declaration
 * of 'a0'/'a1'/'a2'/'w'. */
741 static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
744 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
746 int av_unused p2 = p[-3 * stride];
747 int av_unused p1 = p[-2 * stride];
748 int av_unused p0 = p[-1 * stride];
749 int av_unused q0 = p[ 0 * stride];
750 int av_unused q1 = p[ 1 * stride];
751 int av_unused q2 = p[ 2 * stride];
753 w = clip_int8(p1 - q1);
754 w = clip_int8(w + 3 * (q0 - p0));
756 a0 = (27 * w + 63) >> 7;
757 a1 = (18 * w + 63) >> 7;
758 a2 = (9 * w + 63) >> 7;
760 p[-3 * stride] = cm[p2 + a2];
761 p[-2 * stride] = cm[p1 + a1];
762 p[-1 * stride] = cm[p0 + a0];
763 p[ 0 * stride] = cm[q0 - a0];
764 p[ 1 * stride] = cm[q1 - a1];
765 p[ 2 * stride] = cm[q2 - a2];
/* Normal-filter edge test: the simple limit against E plus all six interior
 * neighbour differences bounded by I.
 * NOTE(review): excerpt is missing the E/I parameter line and opening
 * brace. */
768 static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
771 int av_unused p3 = p[-4 * stride];
772 int av_unused p2 = p[-3 * stride];
773 int av_unused p1 = p[-2 * stride];
774 int av_unused p0 = p[-1 * stride];
775 int av_unused q0 = p[ 0 * stride];
776 int av_unused q1 = p[ 1 * stride];
777 int av_unused q2 = p[ 2 * stride];
778 int av_unused q3 = p[ 3 * stride];
780 return vp8_simple_limit(p, stride, E) &&
781 FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
782 FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
783 FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
/* Vertical (horizontal-edge) normal loop filter on an 8-pixel column strip:
 * loads rows p3..q3 around dst with unaligned gsldlc1/gsldrc1 pairs, runs
 * the shared filter core, and stores back the six modified rows p2..q2.
 * NOTE(review): excerpt is missing lines — the opening brace, local
 * declarations (ftmp[], tmp[]), the __asm__ introducer, the
 * MMI_VP8_LOOP_FILTER invocation between the load and store sections, and
 * the closing clobber list / braces. */
786 static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
787 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
795 /* Get data from dst */
796 "gsldlc1 %[q0], 0x07(%[dst]) \n\t"
797 "gsldrc1 %[q0], 0x00(%[dst]) \n\t"
798 PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
799 "gsldlc1 %[p0], 0x07(%[tmp0]) \n\t"
800 "gsldrc1 %[p0], 0x00(%[tmp0]) \n\t"
801 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
802 "gsldlc1 %[p1], 0x07(%[tmp0]) \n\t"
803 "gsldrc1 %[p1], 0x00(%[tmp0]) \n\t"
804 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
805 "gsldlc1 %[p2], 0x07(%[tmp0]) \n\t"
806 "gsldrc1 %[p2], 0x00(%[tmp0]) \n\t"
807 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
808 "gsldlc1 %[p3], 0x07(%[tmp0]) \n\t"
809 "gsldrc1 %[p3], 0x00(%[tmp0]) \n\t"
810 PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
811 "gsldlc1 %[q1], 0x07(%[tmp0]) \n\t"
812 "gsldrc1 %[q1], 0x00(%[tmp0]) \n\t"
813 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
814 "gsldlc1 %[q2], 0x07(%[tmp0]) \n\t"
815 "gsldrc1 %[q2], 0x00(%[tmp0]) \n\t"
816 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
817 "gsldlc1 %[q3], 0x07(%[tmp0]) \n\t"
818 "gsldrc1 %[q3], 0x00(%[tmp0]) \n\t"
821 "gssdlc1 %[q0], 0x07(%[dst]) \n\t"
822 "gssdrc1 %[q0], 0x00(%[dst]) \n\t"
823 PTR_SUBU "%[tmp0], %[dst], %[stride] \n\t"
824 "gssdlc1 %[p0], 0x07(%[tmp0]) \n\t"
825 "gssdrc1 %[p0], 0x00(%[tmp0]) \n\t"
826 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
827 "gssdlc1 %[p1], 0x07(%[tmp0]) \n\t"
828 "gssdrc1 %[p1], 0x00(%[tmp0]) \n\t"
829 PTR_SUBU "%[tmp0], %[tmp0], %[stride] \n\t"
830 "gssdlc1 %[p2], 0x07(%[tmp0]) \n\t"
831 "gssdrc1 %[p2], 0x00(%[tmp0]) \n\t"
832 PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
833 "gssdlc1 %[q1], 0x07(%[tmp0]) \n\t"
834 "gssdrc1 %[q1], 0x00(%[tmp0]) \n\t"
835 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
836 "gssdlc1 %[q2], 0x07(%[tmp0]) \n\t"
837 "gssdrc1 %[q2], 0x00(%[tmp0]) \n\t"
838 : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
839 [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
840 [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
841 [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
842 [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
843 [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
844 [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
845 [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
846 [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
847 [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
848 RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
849 RESTRICT_ASM_UINT32_T
850 : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
851 [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
/* Scalar inner-edge vertical loop filter over 8 columns: for each column
 * passing vp8_normal_limit, apply the 4-tap core when hev, otherwise the
 * non-4-tap core.
 * NOTE(review): excerpt is missing lines — the opening brace, 'int i;',
 * and the 'if (hv)' / 'else' lines that select between the two calls. */
856 static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
857 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
861 for (i = 0; i < 8; i++)
862 if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
863 int hv = hev(dst + i * 1, stride, hev_thresh);
865 vp8_filter_common_is4tap(dst + i * 1, stride);
867 vp8_filter_common_isnot4tap(dst + i * 1, stride);
/* vp8_h_loop_filter8_mmi: MMI horizontal loop filter across a vertical edge
 * for 8 rows: loads 8 bytes straddling the edge from each row (4 before, 4
 * after: offsets -0x04..0x03), transposes so the edge becomes horizontal,
 * filters, transposes back and stores.
 * NOTE(review): listing is truncated — the asm prologue, the filter
 * invocation between the two transposes (original line 908), and the
 * closing of the function are missing. */
871 static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
872 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
880 /* Get data from dst */
881 "gsldlc1 %[p3], 0x03(%[dst]) \n\t"
882 "gsldrc1 %[p3], -0x04(%[dst]) \n\t"
883 PTR_ADDU "%[tmp0], %[dst], %[stride] \n\t"
884 "gsldlc1 %[p2], 0x03(%[tmp0]) \n\t"
885 "gsldrc1 %[p2], -0x04(%[tmp0]) \n\t"
886 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
887 "gsldlc1 %[p1], 0x03(%[tmp0]) \n\t"
888 "gsldrc1 %[p1], -0x04(%[tmp0]) \n\t"
889 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
890 "gsldlc1 %[p0], 0x03(%[tmp0]) \n\t"
891 "gsldrc1 %[p0], -0x04(%[tmp0]) \n\t"
892 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
893 "gsldlc1 %[q0], 0x03(%[tmp0]) \n\t"
894 "gsldrc1 %[q0], -0x04(%[tmp0]) \n\t"
895 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
896 "gsldlc1 %[q1], 0x03(%[tmp0]) \n\t"
897 "gsldrc1 %[q1], -0x04(%[tmp0]) \n\t"
898 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
899 "gsldlc1 %[q2], 0x03(%[tmp0]) \n\t"
900 "gsldrc1 %[q2], -0x04(%[tmp0]) \n\t"
901 PTR_ADDU "%[tmp0], %[tmp0], %[stride] \n\t"
902 "gsldlc1 %[q3], 0x03(%[tmp0]) \n\t"
903 "gsldrc1 %[q3], -0x04(%[tmp0]) \n\t"
904 /* Matrix transpose */
905 TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
906 %[q0], %[q1], %[q2], %[q3],
907 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
/* NOTE(review): original line 908 is absent here — presumably the
 * MMI_VP8_LOOP_FILTER invocation between the two transposes; confirm
 * against upstream. */
909 /* Matrix transpose */
910 TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
911 %[q0], %[q1], %[q2], %[q3],
912 %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
/* Store the eight filtered rows back, one row per stride step. */
914 "gssdlc1 %[p3], 0x03(%[dst]) \n\t"
915 "gssdrc1 %[p3], -0x04(%[dst]) \n\t"
916 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
917 "gssdlc1 %[p2], 0x03(%[dst]) \n\t"
918 "gssdrc1 %[p2], -0x04(%[dst]) \n\t"
919 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
920 "gssdlc1 %[p1], 0x03(%[dst]) \n\t"
921 "gssdrc1 %[p1], -0x04(%[dst]) \n\t"
922 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
923 "gssdlc1 %[p0], 0x03(%[dst]) \n\t"
924 "gssdrc1 %[p0], -0x04(%[dst]) \n\t"
925 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
926 "gssdlc1 %[q0], 0x03(%[dst]) \n\t"
927 "gssdrc1 %[q0], -0x04(%[dst]) \n\t"
928 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
929 "gssdlc1 %[q1], 0x03(%[dst]) \n\t"
930 "gssdrc1 %[q1], -0x04(%[dst]) \n\t"
931 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
932 "gssdlc1 %[q2], 0x03(%[dst]) \n\t"
933 "gssdrc1 %[q2], -0x04(%[dst]) \n\t"
934 PTR_ADDU "%[dst], %[dst], %[stride] \n\t"
935 "gssdlc1 %[q3], 0x03(%[dst]) \n\t"
936 "gssdrc1 %[q3], -0x04(%[dst]) \n\t"
/* Output operands: p3..q3 and scratch registers live in ftmp[]. */
937 : [p3]"=&f"(ftmp[0]), [p2]"=&f"(ftmp[1]),
938 [p1]"=&f"(ftmp[2]), [p0]"=&f"(ftmp[3]),
939 [q0]"=&f"(ftmp[4]), [q1]"=&f"(ftmp[5]),
940 [q2]"=&f"(ftmp[6]), [q3]"=&f"(ftmp[7]),
941 [ftmp0]"=&f"(ftmp[8]), [ftmp1]"=&f"(ftmp[9]),
942 [ftmp2]"=&f"(ftmp[10]), [ftmp3]"=&f"(ftmp[11]),
943 [hev]"=&f"(ftmp[12]), [mask]"=&f"(ftmp[13]),
944 [ftmp4]"=&f"(ftmp[14]), [ftmp5]"=&f"(ftmp[15]),
945 [ftmp6]"=&f"(ftmp[16]), [ftmp7]"=&f"(ftmp[17]),
946 [dst]"+&r"(dst), [tmp0]"=&r"(tmp[0]),
947 RESTRICT_ASM_DOUBLE_1, RESTRICT_ASM_DOUBLE_2,
948 RESTRICT_ASM_UINT32_T
/* Input operands: filter limits and stride, widened to mips_reg. */
949 : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
950 [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
955 static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
956 ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
960 for (i = 0; i < 8; i++)
961 if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
962 int hv = hev(dst + i * stride, 1, hev_thresh);
964 vp8_filter_common_is4tap(dst + i * stride, 1);
966 vp8_filter_common_isnot4tap(dst + i * stride, 1);
/* ff_vp8_luma_dc_wht_mmi: inverse Walsh-Hadamard transform of the 4x4 luma
 * DC block; the 16 results land in coefficient 0 of each 4x4 sub-block and
 * the dc[] scratch block is cleared afterwards.
 * NOTE(review): listing is truncated — the asm open/close, the preprocessor
 * alternation between the MMI and plain-C variants, and several braces are
 * missing (gaps in the original line numbering). */
970 void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
/* MMI path: columns pass of the WHT on four packed 4x16-bit rows. */
977 MMI_LDC1(%[ftmp0], %[dc], 0x00)
978 MMI_LDC1(%[ftmp1], %[dc], 0x08)
979 MMI_LDC1(%[ftmp2], %[dc], 0x10)
980 MMI_LDC1(%[ftmp3], %[dc], 0x18)
981 "paddsh %[ftmp4], %[ftmp0], %[ftmp3] \n\t"
982 "psubsh %[ftmp5], %[ftmp0], %[ftmp3] \n\t"
983 "paddsh %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
984 "psubsh %[ftmp7], %[ftmp1], %[ftmp2] \n\t"
985 "paddsh %[ftmp0], %[ftmp4], %[ftmp6] \n\t"
986 "paddsh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
987 "psubsh %[ftmp2], %[ftmp4], %[ftmp6] \n\t"
988 "psubsh %[ftmp3], %[ftmp5], %[ftmp7] \n\t"
989 MMI_SDC1(%[ftmp0], %[dc], 0x00)
990 MMI_SDC1(%[ftmp1], %[dc], 0x08)
991 MMI_SDC1(%[ftmp2], %[dc], 0x10)
992 MMI_SDC1(%[ftmp3], %[dc], 0x18)
993 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
994 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
995 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
996 [ftmp6]"=&f"(ftmp[6]),
998 [ftmp7]"=&f"(ftmp[7])
999 : [dc]"r"((uint8_t*)dc)
/* Rows pass done in scalar C: +3 rounding then >>3, result into each
 * sub-block's DC coefficient. */
1003 block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1004 block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1005 block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1006 block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1008 block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1009 block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1010 block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1011 block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1013 block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1014 block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1015 block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1016 block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1018 block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1019 block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1020 block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1021 block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
/* Clear the consumed dc[] block with a zeroed MMI register. */
1024 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1025 MMI_SDC1(%[ftmp0], %[dc], 0x00)
1026 MMI_SDC1(%[ftmp0], %[dc], 0x08)
1027 MMI_SDC1(%[ftmp0], %[dc], 0x10)
1028 MMI_SDC1(%[ftmp0], %[dc], 0x18)
1029 : RESTRICT_ASM_ALL64
1030 [ftmp0]"=&f"(ftmp[0])
1031 : [dc]"r"((uint8_t *)dc)
/* Alternate plain-C variant of the same transform (presumably the #else
 * branch of a preprocessor switch — the #if/#else lines are missing). */
1035 int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;
1037 t00 = dc[0] + dc[12];
1038 t10 = dc[1] + dc[13];
1039 t20 = dc[2] + dc[14];
1040 t30 = dc[3] + dc[15];
1042 t03 = dc[0] - dc[12];
1043 t13 = dc[1] - dc[13];
1044 t23 = dc[2] - dc[14];
1045 t33 = dc[3] - dc[15];
1047 t01 = dc[4] + dc[ 8];
1048 t11 = dc[5] + dc[ 9];
1049 t21 = dc[6] + dc[10];
1050 t31 = dc[7] + dc[11];
1052 t02 = dc[4] - dc[ 8];
1053 t12 = dc[5] - dc[ 9];
1054 t22 = dc[6] - dc[10];
1055 t32 = dc[7] - dc[11];
1077 block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
1078 block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
1079 block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
1080 block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;
1082 block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
1083 block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
1084 block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
1085 block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;
1087 block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
1088 block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
1089 block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
1090 block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;
1092 block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
1093 block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
1094 block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
1095 block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;
/**
 * Fast path of the VP8 luma DC Walsh-Hadamard transform when only the DC
 * coefficient of the 4x4 DC block is non-zero: every one of the 16
 * sub-blocks receives the same rounded value.
 *
 * @param block 4x4 array of 16-coefficient sub-blocks; only each
 *              sub-block's coefficient 0 is written.
 * @param dc    DC coefficient block; dc[0] is consumed and cleared.
 *
 * NOTE(review): the listing was garbled (missing braces and the
 * `dc[0] = 0;` line implied by the numbering gap); reconstructed, and the
 * 16 identical assignments folded into a loop.
 */
void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
{
    int i, j;
    /* (dc[0] + 3) >> 3 mirrors the rounding of the full WHT path. */
    int val = (dc[0] + 3) >> 3;

    dc[0] = 0;

    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++)
            block[i][j][0] = val;
}
/* ff_vp8_idct_add_mmi: 4x4 inverse DCT of block[] added onto dst with
 * unsigned-saturating pack; block[] is zeroed as it is consumed.
 * The 0x22a3/0x4e7b constants implement the VP8 MUL_35468/MUL_20091
 * fixed-point multipliers (see the C reference at the end).
 * NOTE(review): listing is truncated — asm open/close, preprocessor
 * alternation with the plain-C variant, and some braces are missing. */
1128 void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1131 DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = {0x4e7b4e7b4e7b4e7bULL};
1132 DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = {0x22a322a322a322a3ULL};
1139 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1140 MMI_LDC1(%[ftmp1], %[block], 0x00)
1141 MMI_LDC1(%[ftmp2], %[block], 0x08)
1142 MMI_LDC1(%[ftmp3], %[block], 0x10)
1143 MMI_LDC1(%[ftmp4], %[block], 0x18)
1145 "li %[tmp0], 0x02 \n\t"
1146 "mtc1 %[tmp0], %[ftmp11] \n\t"
1148 // block[0...3] + block[8...11]
1149 "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
1150 // block[0...3] - block[8...11]
1151 "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
1152 // MUL_35468(block[12...15])
1153 "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
1154 "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t"
1155 // MUL_35468(block[4...7])
1156 "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
1157 "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t"
1158 // MUL_20091(block[4...7]
1159 "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t"
1160 "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t"
1161 // MUL_20091(block[12...15])
1162 "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
1163 "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t"
/* First (vertical) butterfly pass. */
1166 "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t"
1167 "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
1169 "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t"
1170 "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t"
1172 "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t"
1173 "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t"
1175 "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t"
1176 "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
/* Zero the coefficient block now that it has been loaded. */
1178 MMI_SDC1(%[ftmp0], %[block], 0x00)
1179 MMI_SDC1(%[ftmp0], %[block], 0x08)
1180 MMI_SDC1(%[ftmp0], %[block], 0x10)
1181 MMI_SDC1(%[ftmp0], %[block], 0x18)
1183 TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1184 %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
/* Second (horizontal) pass after the transpose. */
1187 "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
1189 "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t"
1191 "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t"
1192 "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
1193 "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t"
1194 "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t"
1195 "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t"
1197 "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t"
1198 "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t"
1199 "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t"
1200 "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t"
1201 "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t"
/* +4 rounding, >>3 final scaling. */
1203 "li %[tmp0], 0x03 \n\t"
1204 "mtc1 %[tmp0], %[ftmp11] \n\t"
1205 "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t"
1206 "paddh %[ftmp1], %[ftmp1], %[ff_pw_4] \n\t"
1207 "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
1208 "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t"
1209 "paddh %[ftmp2], %[ftmp2], %[ff_pw_4] \n\t"
1210 "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
1211 "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t"
1212 "paddh %[ftmp3], %[ftmp3], %[ff_pw_4] \n\t"
1213 "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
1214 "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t"
1215 "paddh %[ftmp4], %[ftmp4], %[ff_pw_4] \n\t"
1216 "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
1218 TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
1219 %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])
/* Add the residual to the four destination rows with saturation. */
1221 MMI_LWC1(%[ftmp5], %[dst0], 0x00)
1222 MMI_LWC1(%[ftmp6], %[dst1], 0x00)
1223 MMI_LWC1(%[ftmp7], %[dst2], 0x00)
1224 MMI_LWC1(%[ftmp8], %[dst3], 0x00)
1226 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1227 "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
1228 "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
1229 "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
1231 "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1232 "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
1233 "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
1234 "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
1236 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1237 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1238 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1239 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1241 MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1242 MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1243 MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1244 MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1245 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1246 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1247 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1248 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1249 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
1250 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
1254 : [dst0]"r"(dst), [dst1]"r"(dst+stride),
1255 [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
1256 [block]"r"(block), [ff_pw_4]"f"(ff_pw_4),
1257 [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_22a3]"f"(ff_ph_22a3)
/* Plain-C reference variant of the same IDCT-add (preprocessor lines
 * absent in this listing). */
1261 int i, t0, t1, t2, t3;
1264 for (i = 0; i < 4; i++) {
1265 t0 = block[0 + i] + block[8 + i];
1266 t1 = block[0 + i] - block[8 + i];
1267 t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
1268 t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
1274 tmp[i * 4 + 0] = t0 + t3;
1275 tmp[i * 4 + 1] = t1 + t2;
1276 tmp[i * 4 + 2] = t1 - t2;
1277 tmp[i * 4 + 3] = t0 - t3;
1280 for (i = 0; i < 4; i++) {
1281 t0 = tmp[0 + i] + tmp[8 + i];
1282 t1 = tmp[0 + i] - tmp[8 + i];
1283 t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
1284 t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);
1286 dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
1287 dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
1288 dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
1289 dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
/* ff_vp8_idct_dc_add_mmi: DC-only 4x4 IDCT — broadcast the rounded DC
 * value and add it with saturation to a 4x4 block of dst.
 * NOTE(review): listing is truncated — asm open/close, the clearing of
 * block[0], preprocessor alternation and braces are missing. */
1295 void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
1298 int dc = (block[0] + 4) >> 3;
/* MMI path: splat dc into ftmp5, widen four dst rows to 16 bit, add,
 * pack back with unsigned saturation. */
1305 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1306 "mtc1 %[dc], %[ftmp5] \n\t"
1307 MMI_LWC1(%[ftmp1], %[dst0], 0x00)
1308 MMI_LWC1(%[ftmp2], %[dst1], 0x00)
1309 MMI_LWC1(%[ftmp3], %[dst2], 0x00)
1310 MMI_LWC1(%[ftmp4], %[dst3], 0x00)
1311 "pshufh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
1312 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1313 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1314 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1315 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1316 "paddsh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
1317 "paddsh %[ftmp2], %[ftmp2], %[ftmp5] \n\t"
1318 "paddsh %[ftmp3], %[ftmp3], %[ftmp5] \n\t"
1319 "paddsh %[ftmp4], %[ftmp4], %[ftmp5] \n\t"
1320 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
1321 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
1322 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
1323 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
1324 MMI_SWC1(%[ftmp1], %[dst0], 0x00)
1325 MMI_SWC1(%[ftmp2], %[dst1], 0x00)
1326 MMI_SWC1(%[ftmp3], %[dst2], 0x00)
1327 MMI_SWC1(%[ftmp4], %[dst3], 0x00)
1328 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1329 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1330 [ftmp4]"=&f"(ftmp[4]),
1332 [ftmp5]"=&f"(ftmp[5])
1333 : [dst0]"r"(dst), [dst1]"r"(dst+stride),
1334 [dst2]"r"(dst+2*stride), [dst3]"r"(dst+3*stride),
/* Plain-C variant (preprocessor lines absent in this listing). */
1339 int i, dc = (block[0] + 4) >> 3;
1343 for (i = 0; i < 4; i++) {
1344 dst[0] = av_clip_uint8(dst[0] + dc);
1345 dst[1] = av_clip_uint8(dst[1] + dc);
1346 dst[2] = av_clip_uint8(dst[2] + dc);
1347 dst[3] = av_clip_uint8(dst[3] + dc);
/**
 * DC-only IDCT-add for four 4x4 luma blocks laid out side by side across
 * a 16-pixel row: block[i] lands at horizontal offset 4*i.
 *
 * NOTE(review): reconstructed from a garbled listing (missing signature
 * tail and braces); the four visible calls are preserved verbatim.
 */
void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst +  0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst +  4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst +  8, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
}
/**
 * DC-only IDCT-add for four 4x4 chroma blocks arranged in a 2x2 grid
 * (8x8 pixels): blocks 0/1 on the top row, 2/3 four rows down.
 *
 * NOTE(review): reconstructed from a garbled listing (missing signature
 * tail and braces); the four visible calls are preserved verbatim.
 */
void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
}
1371 // loop filter applied to edges between macroblocks
/**
 * Macroblock-edge vertical loop filter for a full 16-pixel-wide luma edge:
 * two 8-wide MMI passes, the second offset 8 pixels to the right.
 *
 * NOTE(review): reconstructed from a garbled listing (braces missing).
 */
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
}
/**
 * Macroblock-edge horizontal loop filter for a full 16-row luma edge:
 * two 8-row MMI passes, the second offset 8 rows down.
 *
 * NOTE(review): reconstructed from a garbled listing (trailing argument
 * and braces missing).
 */
void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
                           hev_thresh);
}
/**
 * Macroblock-edge vertical loop filter for the two 8-pixel-wide chroma
 * planes: one 8-wide pass per plane.
 *
 * NOTE(review): reconstructed from a garbled listing (braces missing).
 */
void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}
/**
 * Macroblock-edge horizontal loop filter for the two 8-row chroma planes:
 * one 8-row pass per plane.
 *
 * NOTE(review): reconstructed from a garbled listing (braces missing).
 */
void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}
1401 // loop filter applied to inner macroblock edges
/**
 * Inner-edge vertical loop filter over a 16-pixel-wide luma edge (plain C).
 *
 * For each column, apply the VP8 normal filter when the edge passes the
 * E/I limits; 4-tap variant on high edge variance, non-4-tap otherwise.
 *
 * NOTE(review): reconstructed from a garbled listing (missing braces and
 * the if/else around the two filter calls, implied by the numbering gaps).
 */
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}
/**
 * Inner-edge horizontal loop filter over a 16-row luma edge (plain C).
 *
 * Row-wise mirror of ff_vp8_v_loop_filter16_inner_mmi: pixel step is 1
 * along each row, rows advance by stride.
 *
 * NOTE(review): reconstructed from a garbled listing (missing braces and
 * the if/else around the two filter calls, implied by the numbering gaps).
 */
void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}
/**
 * Inner-edge vertical loop filter for the two 8-wide chroma planes:
 * delegates to the 8-wide inner filter once per plane.
 *
 * NOTE(review): reconstructed from a garbled listing (braces missing).
 */
void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}
/**
 * Inner-edge horizontal loop filter for the two 8-row chroma planes:
 * delegates to the 8-row inner filter once per plane.
 *
 * NOTE(review): reconstructed from a garbled listing (braces missing).
 */
void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}
/**
 * Simple-profile vertical loop filter: for each of the 16 columns, apply
 * the 4-tap common filter when the edge passes the simple limit test.
 *
 * NOTE(review): reconstructed from a garbled listing (braces and the
 * local declaration missing).
 */
void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i, stride, flim))
            vp8_filter_common_is4tap(dst + i, stride);
}
/**
 * Simple-profile horizontal loop filter: row-wise mirror of the vertical
 * variant — pixel step 1 along each of the 16 rows.
 *
 * NOTE(review): reconstructed from a garbled listing (braces and the
 * local declaration missing).
 */
void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i * stride, 1, flim))
            vp8_filter_common_is4tap(dst + i * stride, 1);
}
/* ff_put_vp8_pixels16_mmi: straight 16xh copy (no subpel filtering; x/y are
 * unused by the copy itself). MMI path copies two rows per iteration:
 * 8 bytes via an FP register plus 8 bytes via unaligned ldl/ldr-sdl/sdr.
 * NOTE(review): listing is truncated — asm open, loop label, preprocessor
 * alternation with the memcpy fallback, and braces are missing. */
1464 void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1465 ptrdiff_t srcstride, int h, int x, int y)
1475 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1476 MMI_ULDC1(%[ftmp0], %[src], 0x00)
1477 "ldl %[tmp0], 0x0f(%[src]) \n\t"
1478 "ldr %[tmp0], 0x08(%[src]) \n\t"
1479 MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
1480 "ldl %[tmp1], 0x0f(%[addr0]) \n\t"
1481 "ldr %[tmp1], 0x08(%[addr0]) \n\t"
1482 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1483 MMI_SDC1(%[ftmp0], %[dst], 0x00)
1484 "sdl %[tmp0], 0x0f(%[dst]) \n\t"
1485 "sdr %[tmp0], 0x08(%[dst]) \n\t"
1486 "addiu %[h], %[h], -0x02 \n\t"
1487 MMI_SDC1(%[ftmp1], %[addr1], 0x00)
1488 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1489 "sdl %[tmp1], 0x0f(%[addr1]) \n\t"
1490 "sdr %[tmp1], 0x08(%[addr1]) \n\t"
1491 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1492 "bnez %[h], 1b \n\t"
1493 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1494 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
1496 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1497 [dst]"+&r"(dst), [src]"+&r"(src),
1499 : [dststride]"r"((mips_reg)dststride),
1500 [srcstride]"r"((mips_reg)srcstride)
/* Plain-C fallback: row-by-row memcpy. */
1506 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1507 memcpy(dst, src, 16);
/* ff_put_vp8_pixels8_mmi: straight 8xh copy, two rows per iteration
 * (one row through an FP register, one through unaligned ldl/ldr).
 * NOTE(review): listing is truncated — asm open, loop label, preprocessor
 * alternation with the memcpy fallback, and braces are missing. */
1511 void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1512 ptrdiff_t srcstride, int h, int x, int y)
1522 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1523 MMI_ULDC1(%[ftmp0], %[src], 0x00)
1524 "ldl %[tmp0], 0x07(%[addr0]) \n\t"
1525 "ldr %[tmp0], 0x00(%[addr0]) \n\t"
1526 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1527 MMI_SDC1(%[ftmp0], %[dst], 0x00)
1528 "addiu %[h], %[h], -0x02 \n\t"
1529 "sdl %[tmp0], 0x07(%[addr1]) \n\t"
1530 "sdr %[tmp0], 0x00(%[addr1]) \n\t"
1531 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1532 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1533 "bnez %[h], 1b \n\t"
1534 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1536 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1537 [dst]"+&r"(dst), [src]"+&r"(src),
1539 : [dststride]"r"((mips_reg)dststride),
1540 [srcstride]"r"((mips_reg)srcstride)
/* Plain-C fallback: row-by-row memcpy. */
1546 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1547 memcpy(dst, src, 8);
/* ff_put_vp8_pixels4_mmi: straight 4xh copy, two rows per iteration
 * (32-bit FP load/store plus unaligned lwl/lwr-swl/swr pair).
 * NOTE(review): listing is truncated — asm open, loop label, preprocessor
 * alternation with the memcpy fallback, and braces are missing. */
1551 void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1552 ptrdiff_t srcstride, int h, int x, int y)
1562 PTR_ADDU "%[addr0], %[src], %[srcstride] \n\t"
1563 MMI_LWC1(%[ftmp0], %[src], 0x00)
1564 "lwl %[tmp0], 0x03(%[addr0]) \n\t"
1565 "lwr %[tmp0], 0x00(%[addr0]) \n\t"
1566 PTR_ADDU "%[addr1], %[dst], %[dststride] \n\t"
1567 MMI_SWC1(%[ftmp0], %[dst], 0x00)
1568 "addiu %[h], %[h], -0x02 \n\t"
1569 "swl %[tmp0], 0x03(%[addr1]) \n\t"
1570 "swr %[tmp0], 0x00(%[addr1]) \n\t"
1571 PTR_ADDU "%[src], %[addr0], %[srcstride] \n\t"
1572 PTR_ADDU "%[dst], %[addr1], %[dststride] \n\t"
1573 "bnez %[h], 1b \n\t"
1574 : [ftmp0]"=&f"(ftmp[0]), [tmp0]"=&r"(tmp[0]),
1576 [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]),
1577 [dst]"+&r"(dst), [src]"+&r"(src),
1579 : [dststride]"r"((mips_reg)dststride),
1580 [srcstride]"r"((mips_reg)srcstride)
/* Plain-C fallback: row-by-row memcpy. */
1586 for (i = 0; i < h; i++, dst += dststride, src += srcstride)
1587 memcpy(dst, src, 4);
/* ff_put_vp8_epel16_h4_mmi: 16-wide horizontal 4-tap subpel interpolation.
 * The scalar dst[0..15] lines below spell out the per-pixel 4-tap formula
 * the MMI macro implements; the asm processes the row as two 8-pixel
 * halves via PUT_VP8_EPEL8_H4_MMI.
 * NOTE(review): listing is truncated — asm open, loop label, preprocessor
 * markers, and braces are missing. */
1591 void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1592 ptrdiff_t srcstride, int h, int mx, int my)
1595 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1598 mips_reg src1, dst1;
1602 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1603 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1604 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1605 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1606 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1607 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1608 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1609 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1611 dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
1612 dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
1613 dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
1614 dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
1615 dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
1616 dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
1617 dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
1618 dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
/* MMI path: zero register + shift count 7 in ftmp4, then per-row loop. */
1621 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1622 "li %[tmp0], 0x07 \n\t"
1623 "mtc1 %[tmp0], %[ftmp4] \n\t"
1627 PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1628 PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1629 PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1631 PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])
1633 "addiu %[h], %[h], -0x01 \n\t"
1634 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1635 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1636 "bnez %[h], 1b \n\t"
1637 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1638 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1639 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1640 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1641 [ftmp8]"=&f"(ftmp[8]),
1642 [tmp0]"=&r"(tmp[0]),
1644 [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1646 [dst]"+&r"(dst), [src]"+&r"(src)
1647 : [ff_pw_64]"f"(ff_pw_64),
1648 [srcstride]"r"((mips_reg)srcstride),
1649 [dststride]"r"((mips_reg)dststride),
1650 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1651 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
/* Plain-C fallback using the byte-valued subpel_filters table. */
1655 const uint8_t *filter = subpel_filters[mx - 1];
1656 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1659 for (y = 0; y < h; y++) {
1660 for (x = 0; x < 16; x++)
1661 dst[x] = FILTER_4TAP(src, filter, 1);
/* ff_put_vp8_epel8_h4_mmi: 8-wide horizontal 4-tap subpel interpolation —
 * single PUT_VP8_EPEL8_H4_MMI invocation per row.
 * NOTE(review): listing is truncated — asm open, loop label, preprocessor
 * markers, and braces are missing. */
1668 void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1669 ptrdiff_t srcstride, int h, int mx, int my)
1672 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1678 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1679 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1680 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1681 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1682 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
1683 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
1684 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
1685 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
1688 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1689 "li %[tmp0], 0x07 \n\t"
1690 "mtc1 %[tmp0], %[ftmp4] \n\t"
1693 PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
1695 "addiu %[h], %[h], -0x01 \n\t"
1696 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1697 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1698 "bnez %[h], 1b \n\t"
1699 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1700 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1701 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1702 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1703 [ftmp8]"=&f"(ftmp[8]),
1704 [tmp0]"=&r"(tmp[0]),
1707 [dst]"+&r"(dst), [src]"+&r"(src)
1708 : [ff_pw_64]"f"(ff_pw_64),
1709 [srcstride]"r"((mips_reg)srcstride),
1710 [dststride]"r"((mips_reg)dststride),
1711 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1712 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
/* Plain-C fallback. */
1716 const uint8_t *filter = subpel_filters[mx - 1];
1717 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1720 for (y = 0; y < h; y++) {
1721 for (x = 0; x < 8; x++)
1722 dst[x] = FILTER_4TAP(src, filter, 1);
/* ff_put_vp8_epel4_h4_mmi: 4-wide horizontal 4-tap subpel interpolation —
 * PUT_VP8_EPEL4_H4_MMI per row.
 * NOTE(review): listing is truncated — asm open, loop label, preprocessor
 * markers, and braces are missing. */
1729 void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1730 ptrdiff_t srcstride, int h, int mx, int my)
1733 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1739 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
1740 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
1741 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
1742 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
1745 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1746 "li %[tmp0], 0x07 \n\t"
1747 "mtc1 %[tmp0], %[ftmp4] \n\t"
1750 PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])
1752 "addiu %[h], %[h], -0x01 \n\t"
1753 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1754 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1755 "bnez %[h], 1b \n\t"
1756 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1757 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1758 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1759 [tmp0]"=&r"(tmp[0]),
1762 [dst]"+&r"(dst), [src]"+&r"(src)
1763 : [ff_pw_64]"f"(ff_pw_64),
1764 [srcstride]"r"((mips_reg)srcstride),
1765 [dststride]"r"((mips_reg)dststride),
1766 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
1767 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
/* Plain-C fallback. */
1771 const uint8_t *filter = subpel_filters[mx - 1];
1772 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1775 for (y = 0; y < h; y++) {
1776 for (x = 0; x < 4; x++)
1777 dst[x] = FILTER_4TAP(src, filter, 1);
/* ff_put_vp8_epel16_h6_mmi: 16-wide horizontal 6-tap subpel interpolation;
 * the asm processes the row as two 8-pixel halves via PUT_VP8_EPEL8_H6_MMI.
 * NOTE(review): listing is truncated — asm open, loop label, preprocessor
 * markers, and braces are missing. */
1784 void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1785 ptrdiff_t srcstride, int h, int mx, int my)
1788 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
1791 mips_reg src1, dst1;
1795 dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
1796 dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
1797 dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
1798 dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
1799 dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
1800 dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
1801 dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
1802 dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];
1804 dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
1805 dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
1806 dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
1807 dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
1808 dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
1809 dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
1810 dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
1811 dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
1814 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1815 "li %[tmp0], 0x07 \n\t"
1816 "mtc1 %[tmp0], %[ftmp4] \n\t"
1820 PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1821 PTR_ADDIU "%[src1], %[src], 0x08 \n\t"
1822 PTR_ADDIU "%[dst1], %[dst], 0x08 \n\t"
1824 PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])
1826 "addiu %[h], %[h], -0x01 \n\t"
1827 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1828 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1829 "bnez %[h], 1b \n\t"
1830 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1831 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1832 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1833 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1834 [ftmp8]"=&f"(ftmp[8]),
1835 [tmp0]"=&r"(tmp[0]),
1837 [dst1]"=&r"(dst1), [src1]"=&r"(src1),
1839 [dst]"+&r"(dst), [src]"+&r"(src)
1840 : [ff_pw_64]"f"(ff_pw_64),
1841 [srcstride]"r"((mips_reg)srcstride),
1842 [dststride]"r"((mips_reg)dststride),
1843 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1844 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1845 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
/* Plain-C fallback. */
1849 const uint8_t *filter = subpel_filters[mx - 1];
1850 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1853 for (y = 0; y < h; y++) {
1854 for (x = 0; x < 16; x++)
1855 dst[x] = FILTER_6TAP(src, filter, 1);
/*
 * 8-pixel-wide, 6-tap horizontal VP8 sub-pixel MC ("epel" h6).
 * mx selects the 6-tap filter; each output pixel is the clamped
 * cm[(6-tap weighted sum + 64) >> 7] shown in the reference formulas
 * below.  The quoted strings are the Loongson-MMI inline-asm fast
 * path; the final loops are the plain-C fallback.
 * NOTE(review): the #if/#else/#endif lines, braces, local
 * declarations and the asm statement header separating the two paths
 * are missing from this extract — confirm against the complete file.
 */
1862 void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1863 ptrdiff_t srcstride, int h, int mx, int my)
1866 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
/* Reference per-pixel formulas documenting what the asm computes: */
1872 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1873 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1874 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1875 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
1876 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
1877 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
1878 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
1879 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
/* MMI path: zero ftmp0, load shift count 7 into ftmp4, then per row
 * filter 8 pixels and advance src/dst by one stride until h == 0. */
1882 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1883 "li %[tmp0], 0x07 \n\t"
1884 "mtc1 %[tmp0], %[ftmp4] \n\t"
1887 PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
1889 "addiu %[h], %[h], -0x01 \n\t"
1890 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1891 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1892 "bnez %[h], 1b \n\t"
1893 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1894 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1895 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1896 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
1897 [ftmp8]"=&f"(ftmp[8]),
1898 [tmp0]"=&r"(tmp[0]),
1901 [dst]"+&r"(dst), [src]"+&r"(src)
1902 : [ff_pw_64]"f"(ff_pw_64),
1903 [srcstride]"r"((mips_reg)srcstride),
1904 [dststride]"r"((mips_reg)dststride),
1905 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1906 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1907 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
/* Plain-C fallback (FILTER_6TAP with pixel stride 1 = horizontal). */
1911 const uint8_t *filter = subpel_filters[mx - 1];
1912 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1915 for (y = 0; y < h; y++) {
1916 for (x = 0; x < 8; x++)
1917 dst[x] = FILTER_6TAP(src, filter, 1);
/*
 * 4-pixel-wide, 6-tap horizontal VP8 sub-pixel MC ("epel" h6).
 * Same per-pixel formula as the 8/16-wide variants, 4 columns per row.
 * NOTE(review): #if/#else/#endif markers, braces and declarations are
 * missing from this extract — confirm against the complete file.
 */
1924 void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1925 ptrdiff_t srcstride, int h, int mx, int my)
1928 const uint64_t *filter = fourtap_subpel_filters[mx - 1];
/* Reference per-pixel formulas documenting what the asm computes: */
1934 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
1935 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
1936 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
1937 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
/* MMI path: one 4-pixel filter invocation per row. */
1940 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
1941 "li %[tmp0], 0x07 \n\t"
1942 "mtc1 %[tmp0], %[ftmp4] \n\t"
1945 PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])
1947 "addiu %[h], %[h], -0x01 \n\t"
1948 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
1949 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
1950 "bnez %[h], 1b \n\t"
1951 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
1952 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
1953 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
1954 [tmp0]"=&r"(tmp[0]),
1957 [dst]"+&r"(dst), [src]"+&r"(src)
1958 : [ff_pw_64]"f"(ff_pw_64),
1959 [srcstride]"r"((mips_reg)srcstride),
1960 [dststride]"r"((mips_reg)dststride),
1961 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
1962 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
1963 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
/* Plain-C fallback. */
1967 const uint8_t *filter = subpel_filters[mx - 1];
1968 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1971 for (y = 0; y < h; y++) {
1972 for (x = 0; x < 4; x++)
1973 dst[x] = FILTER_6TAP(src, filter, 1);
/*
 * 16-pixel-wide, 4-tap vertical VP8 sub-pixel MC ("epel" v4).
 * my selects the 4-tap filter.  The MMI path processes each 16-pixel
 * row as two 8-pixel halves: the lower half at src/dst, the upper
 * half at src0 = src + 8 / dst0 = dst + 8.
 *
 * BUG FIX: the second PUT_VP8_EPEL8_V4_MMI invocation read from
 * %[src0] (src + 8) but wrote through %[dst] instead of %[dst0]
 * (dst + 8), so columns 8..15 overwrote columns 0..7 and dst+8 was
 * never written.  %[dst0] is computed just above and declared as an
 * output operand, and the parallel v6 function writes its upper half
 * through %[dst0] — both confirm %[dst0] is the intended destination.
 *
 * NOTE(review): the #if/#else/#endif lines, braces, local
 * declarations and the asm statement header separating the MMI path
 * from the C fallback are missing from this extract — confirm
 * against the complete file.
 */
1980 void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1981 ptrdiff_t srcstride, int h, int mx, int my)
1984 const uint64_t *filter = fourtap_subpel_filters[my - 1];
1987 mips_reg src0, src1, dst0;
/* Reference per-pixel formulas documenting what the asm computes: */
1991 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
1992 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
1993 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
1994 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
1995 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
1996 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
1997 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
1998 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2000 dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
2001 dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
2002 dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
2003 dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
2004 dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
2005 dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
2006 dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
2007 dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
/* MMI path: zero ftmp0, load shift count 7 into ftmp4. */
2010 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2011 "li %[tmp0], 0x07 \n\t"
2012 "mtc1 %[tmp0], %[ftmp4] \n\t"
/* Lower 8 columns. */
2016 PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2017 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2018 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
/* Upper 8 columns: was %[dst] (clobbering the lower half) — must be %[dst0]. */
2020 PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2022 "addiu %[h], %[h], -0x01 \n\t"
2023 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2024 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2025 "bnez %[h], 1b \n\t"
2026 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2027 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2028 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2029 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2030 [ftmp8]"=&f"(ftmp[8]),
2031 [tmp0]"=&r"(tmp[0]),
2033 [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2036 [dst]"+&r"(dst), [src]"+&r"(src)
2037 : [ff_pw_64]"f"(ff_pw_64),
2038 [srcstride]"r"((mips_reg)srcstride),
2039 [dststride]"r"((mips_reg)dststride),
2040 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2041 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
/* Plain-C fallback (FILTER_4TAP with pixel stride srcstride = vertical). */
2045 const uint8_t *filter = subpel_filters[my - 1];
2046 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2049 for (y = 0; y < h; y++) {
2050 for (x = 0; x < 16; x++)
2051 dst[x] = FILTER_4TAP(src, filter, srcstride);
/*
 * 8-pixel-wide, 4-tap vertical VP8 sub-pixel MC ("epel" v4).
 * my selects the 4-tap filter; each output pixel is
 * cm[(4-tap weighted column sum + 64) >> 7] per the formulas below.
 * NOTE(review): #if/#else/#endif markers, braces and declarations are
 * missing from this extract — confirm against the complete file.
 */
2058 void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2059 ptrdiff_t srcstride, int h, int mx, int my)
2062 const uint64_t *filter = fourtap_subpel_filters[my - 1];
/* Reference per-pixel formulas documenting what the asm computes: */
2069 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2070 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2071 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2072 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2073 dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2074 dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2075 dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2076 dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
/* MMI path: one 8-pixel filter invocation per row. */
2079 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2080 "li %[tmp0], 0x07 \n\t"
2081 "mtc1 %[tmp0], %[ftmp4] \n\t"
2084 PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2086 "addiu %[h], %[h], -0x01 \n\t"
2087 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2088 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2089 "bnez %[h], 1b \n\t"
2090 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2091 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2092 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2093 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2094 [ftmp8]"=&f"(ftmp[8]),
2095 [tmp0]"=&r"(tmp[0]),
2099 [dst]"+&r"(dst), [src]"+&r"(src)
2100 : [ff_pw_64]"f"(ff_pw_64),
2101 [srcstride]"r"((mips_reg)srcstride),
2102 [dststride]"r"((mips_reg)dststride),
2103 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2104 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
/* Plain-C fallback. */
2108 const uint8_t *filter = subpel_filters[my - 1];
2109 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2112 for (y = 0; y < h; y++) {
2113 for (x = 0; x < 8; x++)
2114 dst[x] = FILTER_4TAP(src, filter, srcstride);
/*
 * 4-pixel-wide, 4-tap vertical VP8 sub-pixel MC ("epel" v4).
 * NOTE(review): #if/#else/#endif markers, braces and declarations are
 * missing from this extract — confirm against the complete file.
 */
2121 void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2122 ptrdiff_t srcstride, int h, int mx, int my)
2125 const uint64_t *filter = fourtap_subpel_filters[my - 1];
/* Reference per-pixel formulas documenting what the asm computes: */
2132 dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[ srcstride] - filter[4] * src[ 2*srcstride] + 64) >> 7];
2133 dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2134 dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2135 dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
/* MMI path: one 4-pixel filter invocation per row. */
2138 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2139 "li %[tmp0], 0x07 \n\t"
2140 "mtc1 %[tmp0], %[ftmp4] \n\t"
2143 PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2145 "addiu %[h], %[h], -0x01 \n\t"
2146 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2147 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2148 "bnez %[h], 1b \n\t"
2149 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2150 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2151 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2152 [tmp0]"=&r"(tmp[0]),
2156 [dst]"+&r"(dst), [src]"+&r"(src)
2157 : [ff_pw_64]"f"(ff_pw_64),
2158 [srcstride]"r"((mips_reg)srcstride),
2159 [dststride]"r"((mips_reg)dststride),
2160 [filter1]"f"(filter[1]), [filter2]"f"(filter[2]),
2161 [filter3]"f"(filter[3]), [filter4]"f"(filter[4])
/* Plain-C fallback. */
2165 const uint8_t *filter = subpel_filters[my - 1];
2166 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2169 for (y = 0; y < h; y++) {
2170 for (x = 0; x < 4; x++)
2171 dst[x] = FILTER_4TAP(src, filter, srcstride);
/*
 * 16-pixel-wide, 6-tap vertical VP8 sub-pixel MC ("epel" v6).
 * The MMI path processes each row as two 8-pixel halves: lower half
 * at src/dst, upper half at src0 = src + 8 / dst0 = dst + 8.
 * NOTE(review): #if/#else/#endif markers, braces, declarations and
 * the asm statement header are missing from this extract — confirm
 * against the complete file.
 */
2178 void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2179 ptrdiff_t srcstride, int h, int mx, int my)
2182 const uint64_t *filter = fourtap_subpel_filters[my - 1];
2185 mips_reg src0, src1, dst0;
/* Reference per-pixel formulas documenting what the asm computes: */
2189 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2190 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2191 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2192 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2193 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2194 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2195 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2196 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2198 dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
2199 dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
2200 dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
2201 dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
2202 dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
2203 dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
2204 dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
2205 dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
/* MMI path: lower 8 columns, then upper 8 via src0/dst0 = base + 8. */
2208 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2209 "li %[tmp0], 0x07 \n\t"
2210 "mtc1 %[tmp0], %[ftmp4] \n\t"
2214 PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2215 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2216 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2218 PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2220 "addiu %[h], %[h], -0x01 \n\t"
2221 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2222 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2223 "bnez %[h], 1b \n\t"
2224 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2225 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2226 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2227 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2228 [ftmp8]"=&f"(ftmp[8]),
2229 [tmp0]"=&r"(tmp[0]),
2231 [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2234 [dst]"+&r"(dst), [src]"+&r"(src)
2235 : [ff_pw_64]"f"(ff_pw_64),
2236 [srcstride]"r"((mips_reg)srcstride),
2237 [dststride]"r"((mips_reg)dststride),
2238 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2239 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2240 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
/* Plain-C fallback. */
2244 const uint8_t *filter = subpel_filters[my - 1];
2245 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2248 for (y = 0; y < h; y++) {
2249 for (x = 0; x < 16; x++)
2250 dst[x] = FILTER_6TAP(src, filter, srcstride);
/*
 * 8-pixel-wide, 6-tap vertical VP8 sub-pixel MC ("epel" v6).
 * NOTE(review): #if/#else/#endif markers, braces and declarations are
 * missing from this extract — confirm against the complete file.
 */
2257 void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2258 ptrdiff_t srcstride, int h, int mx, int my)
2261 const uint64_t *filter = fourtap_subpel_filters[my - 1];
/* Reference per-pixel formulas documenting what the asm computes: */
2268 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2269 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2270 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2271 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2272 dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2273 dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2274 dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2275 dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
/* MMI path: one 8-pixel filter invocation per row. */
2278 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2279 "li %[tmp0], 0x07 \n\t"
2280 "mtc1 %[tmp0], %[ftmp4] \n\t"
2283 PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2285 "addiu %[h], %[h], -0x01 \n\t"
2286 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2287 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2288 "bnez %[h], 1b \n\t"
2289 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2290 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2291 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2292 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
2293 [ftmp8]"=&f"(ftmp[8]),
2294 [tmp0]"=&r"(tmp[0]),
2298 [dst]"+&r"(dst), [src]"+&r"(src)
2299 : [ff_pw_64]"f"(ff_pw_64),
2300 [srcstride]"r"((mips_reg)srcstride),
2301 [dststride]"r"((mips_reg)dststride),
2302 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2303 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2304 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
/* Plain-C fallback. */
2308 const uint8_t *filter = subpel_filters[my - 1];
2309 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2312 for (y = 0; y < h; y++) {
2313 for (x = 0; x < 8; x++)
2314 dst[x] = FILTER_6TAP(src, filter, srcstride);
/*
 * 4-pixel-wide, 6-tap vertical VP8 sub-pixel MC ("epel" v6).
 * NOTE(review): #if/#else/#endif markers, braces and declarations are
 * missing from this extract — confirm against the complete file.
 */
2321 void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2322 ptrdiff_t srcstride, int h, int mx, int my)
2325 const uint64_t *filter = fourtap_subpel_filters[my - 1];
/* Reference per-pixel formulas documenting what the asm computes: */
2332 dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2333 dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2334 dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2335 dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
/* MMI path: one 4-pixel filter invocation per row. */
2338 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2339 "li %[tmp0], 0x07 \n\t"
2340 "mtc1 %[tmp0], %[ftmp4] \n\t"
2343 PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2345 "addiu %[h], %[h], -0x01 \n\t"
2346 PTR_ADDU "%[src], %[src], %[srcstride] \n\t"
2347 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t"
2348 "bnez %[h], 1b \n\t"
2349 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2350 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2351 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2352 [tmp0]"=&r"(tmp[0]),
2356 [dst]"+&r"(dst), [src]"+&r"(src)
2357 : [ff_pw_64]"f"(ff_pw_64),
2358 [srcstride]"r"((mips_reg)srcstride),
2359 [dststride]"r"((mips_reg)dststride),
2360 [filter0]"f"(filter[0]), [filter1]"f"(filter[1]),
2361 [filter2]"f"(filter[2]), [filter3]"f"(filter[3]),
2362 [filter4]"f"(filter[4]), [filter5]"f"(filter[5])
/* Plain-C fallback. */
2366 const uint8_t *filter = subpel_filters[my - 1];
2367 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2370 for (y = 0; y < h; y++) {
2371 for (x = 0; x < 4; x++)
2372 dst[x] = FILTER_6TAP(src, filter, srcstride);
/*
 * 16x{h} 2-D VP8 sub-pixel MC: 4-tap horizontal then 4-tap vertical,
 * via a 16-wide temporary buffer (h+3 rows: 1 above, 2 below, since
 * the vertical 4-tap needs rows -1..+2).  The vertical pass reads
 * from tmp_array + 16, i.e. row 1 of the temp buffer.
 * NOTE(review): the source-pointer pre-adjustment before the
 * horizontal pass (src -= srcstride) and the #if/#else/#endif
 * structure are not visible in this extract — confirm against the
 * complete file.
 */
2379 void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2380 ptrdiff_t srcstride, int h, int mx, int my)
2383 DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2384 uint8_t *tmp = tmp_array;
2387 ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2388 tmp = tmp_array + 16;
2389 ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2391 const uint8_t *filter = subpel_filters[mx - 1];
2392 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2394 uint8_t tmp_array[560];
2395 uint8_t *tmp = tmp_array;
2399 for (y = 0; y < h + 3; y++) {
2400 for (x = 0; x < 16; x++)
2401 tmp[x] = FILTER_4TAP(src, filter, 1);
2406 tmp = tmp_array + 16;
2407 filter = subpel_filters[my - 1];
2409 for (y = 0; y < h; y++) {
2410 for (x = 0; x < 16; x++)
2411 dst[x] = FILTER_4TAP(tmp, filter, 16);
/*
 * 8x{h} 2-D VP8 sub-pixel MC: 4-tap horizontal then 4-tap vertical,
 * via an 8-wide temporary buffer (h+3 rows); vertical pass starts at
 * tmp_array + 8 (row 1).
 * NOTE(review): the src pre-adjustment and #if/#else/#endif structure
 * are not visible in this extract — confirm against the full file.
 */
2418 void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2419 ptrdiff_t srcstride, int h, int mx, int my)
2422 DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2423 uint8_t *tmp = tmp_array;
2426 ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2427 tmp = tmp_array + 8;
2428 ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2430 const uint8_t *filter = subpel_filters[mx - 1];
2431 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2433 uint8_t tmp_array[152];
2434 uint8_t *tmp = tmp_array;
2438 for (y = 0; y < h + 3; y++) {
2439 for (x = 0; x < 8; x++)
2440 tmp[x] = FILTER_4TAP(src, filter, 1);
2445 tmp = tmp_array + 8;
2446 filter = subpel_filters[my - 1];
2448 for (y = 0; y < h; y++) {
2449 for (x = 0; x < 8; x++)
2450 dst[x] = FILTER_4TAP(tmp, filter, 8);
/*
 * 4x{h} 2-D VP8 sub-pixel MC: 4-tap horizontal then 4-tap vertical,
 * via a 4-wide temporary buffer (h+3 rows); vertical pass starts at
 * tmp_array + 4 (row 1).
 * NOTE(review): the src pre-adjustment and #if/#else/#endif structure
 * are not visible in this extract — confirm against the full file.
 */
2457 void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2458 ptrdiff_t srcstride, int h, int mx, int my)
2461 DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2462 uint8_t *tmp = tmp_array;
2465 ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2466 tmp = tmp_array + 4;
2467 ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2469 const uint8_t *filter = subpel_filters[mx - 1];
2470 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2472 uint8_t tmp_array[44];
2473 uint8_t *tmp = tmp_array;
2477 for (y = 0; y < h + 3; y++) {
2478 for (x = 0; x < 4; x++)
2479 tmp[x] = FILTER_4TAP(src, filter, 1);
2483 tmp = tmp_array + 4;
2484 filter = subpel_filters[my - 1];
2486 for (y = 0; y < h; y++) {
2487 for (x = 0; x < 4; x++)
2488 dst[x] = FILTER_4TAP(tmp, filter, 4);
/*
 * 16x{h} 2-D VP8 sub-pixel MC: 4-tap horizontal then 6-tap vertical.
 * src is backed up 2 rows so the horizontal pass produces h+5 rows
 * (2 above, 3 below) for the vertical 6-tap; the vertical pass reads
 * from tmp_array + 32 (row 2 of the 16-wide temp buffer).
 * NOTE(review): #if/#else/#endif structure is not visible here.
 */
2495 void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2496 ptrdiff_t srcstride, int h, int mx, int my)
2499 DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2500 uint8_t *tmp = tmp_array;
2502 src -= 2 * srcstride;
2503 ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2504 tmp = tmp_array + 32;
2505 ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2507 const uint8_t *filter = subpel_filters[mx - 1];
2508 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2510 uint8_t tmp_array[592];
2511 uint8_t *tmp = tmp_array;
2513 src -= 2 * srcstride;
2515 for (y = 0; y < h + 5; y++) {
2516 for (x = 0; x < 16; x++)
2517 tmp[x] = FILTER_4TAP(src, filter, 1);
2522 tmp = tmp_array + 32;
2523 filter = subpel_filters[my - 1];
2525 for (y = 0; y < h; y++) {
2526 for (x = 0; x < 16; x++)
2527 dst[x] = FILTER_6TAP(tmp, filter, 16);
/*
 * 8x{h} 2-D VP8 sub-pixel MC: 4-tap horizontal then 6-tap vertical.
 * src is backed up 2 rows; horizontal pass fills h+5 rows of an
 * 8-wide temp buffer; vertical pass reads from tmp_array + 16 (row 2).
 * NOTE(review): #if/#else/#endif structure is not visible here.
 */
2534 void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2535 ptrdiff_t srcstride, int h, int mx, int my)
2538 DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2539 uint8_t *tmp = tmp_array;
2541 src -= 2 * srcstride;
2542 ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2543 tmp = tmp_array + 16;
2544 ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2546 const uint8_t *filter = subpel_filters[mx - 1];
2547 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2549 uint8_t tmp_array[168];
2550 uint8_t *tmp = tmp_array;
2552 src -= 2 * srcstride;
2554 for (y = 0; y < h + 5; y++) {
2555 for (x = 0; x < 8; x++)
2556 tmp[x] = FILTER_4TAP(src, filter, 1);
2561 tmp = tmp_array + 16;
2562 filter = subpel_filters[my - 1];
2564 for (y = 0; y < h; y++) {
2565 for (x = 0; x < 8; x++)
2566 dst[x] = FILTER_6TAP(tmp, filter, 8);
/*
 * 4x{h} 2-D VP8 sub-pixel MC: 4-tap horizontal then 6-tap vertical.
 * src is backed up 2 rows; horizontal pass fills h+5 rows of a
 * 4-wide temp buffer; vertical pass reads from tmp_array + 8 (row 2).
 * NOTE(review): #if/#else/#endif structure is not visible here.
 */
2573 void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2574 ptrdiff_t srcstride, int h, int mx, int my)
2577 DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2578 uint8_t *tmp = tmp_array;
2580 src -= 2 * srcstride;
2581 ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2582 tmp = tmp_array + 8;
2583 ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2585 const uint8_t *filter = subpel_filters[mx - 1];
2586 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2588 uint8_t tmp_array[52];
2589 uint8_t *tmp = tmp_array;
2591 src -= 2 * srcstride;
2593 for (y = 0; y < h + 5; y++) {
2594 for (x = 0; x < 4; x++)
2595 tmp[x] = FILTER_4TAP(src, filter, 1);
2600 tmp = tmp_array + 8;
2601 filter = subpel_filters[my - 1];
2603 for (y = 0; y < h; y++) {
2604 for (x = 0; x < 4; x++)
2605 dst[x] = FILTER_6TAP(tmp, filter, 4);
/*
 * 16x{h} 2-D VP8 sub-pixel MC: 6-tap horizontal then 4-tap vertical,
 * via a 16-wide temp buffer (h+3 rows); vertical pass starts at
 * tmp_array + 16 (row 1).
 * NOTE(review): the src pre-adjustment and #if/#else/#endif structure
 * are not visible in this extract — confirm against the full file.
 */
2612 void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2613 ptrdiff_t srcstride, int h, int mx, int my)
2616 DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2617 uint8_t *tmp = tmp_array;
2620 ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2621 tmp = tmp_array + 16;
2622 ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2624 const uint8_t *filter = subpel_filters[mx - 1];
2625 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2627 uint8_t tmp_array[560];
2628 uint8_t *tmp = tmp_array;
2632 for (y = 0; y < h + 3; y++) {
2633 for (x = 0; x < 16; x++)
2634 tmp[x] = FILTER_6TAP(src, filter, 1);
2639 tmp = tmp_array + 16;
2640 filter = subpel_filters[my - 1];
2642 for (y = 0; y < h; y++) {
2643 for (x = 0; x < 16; x++)
2644 dst[x] = FILTER_4TAP(tmp, filter, 16);
/*
 * 8x{h} 2-D VP8 sub-pixel MC: 6-tap horizontal then 4-tap vertical,
 * via an 8-wide temp buffer (h+3 rows); vertical pass starts at
 * tmp_array + 8 (row 1).
 * NOTE(review): the src pre-adjustment and #if/#else/#endif structure
 * are not visible in this extract — confirm against the full file.
 */
2651 void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2652 ptrdiff_t srcstride, int h, int mx, int my)
2655 DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2656 uint8_t *tmp = tmp_array;
2659 ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2660 tmp = tmp_array + 8;
2661 ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2663 const uint8_t *filter = subpel_filters[mx - 1];
2664 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2666 uint8_t tmp_array[152];
2667 uint8_t *tmp = tmp_array;
2671 for (y = 0; y < h + 3; y++) {
2672 for (x = 0; x < 8; x++)
2673 tmp[x] = FILTER_6TAP(src, filter, 1);
2678 tmp = tmp_array + 8;
2679 filter = subpel_filters[my - 1];
2681 for (y = 0; y < h; y++) {
2682 for (x = 0; x < 8; x++)
2683 dst[x] = FILTER_4TAP(tmp, filter, 8);
/*
 * 4x{h} 2-D VP8 sub-pixel MC: 6-tap horizontal then 4-tap vertical,
 * via a 4-wide temp buffer (h+3 rows); vertical pass starts at
 * tmp_array + 4 (row 1).
 * NOTE(review): the src pre-adjustment and #if/#else/#endif structure
 * are not visible in this extract — confirm against the full file.
 */
2690 void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2691 ptrdiff_t srcstride, int h, int mx, int my)
2694 DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2695 uint8_t *tmp = tmp_array;
2698 ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2699 tmp = tmp_array + 4;
2700 ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2702 const uint8_t *filter = subpel_filters[mx - 1];
2703 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2705 uint8_t tmp_array[44];
2706 uint8_t *tmp = tmp_array;
2710 for (y = 0; y < h + 3; y++) {
2711 for (x = 0; x < 4; x++)
2712 tmp[x] = FILTER_6TAP(src, filter, 1);
2717 tmp = tmp_array + 4;
2718 filter = subpel_filters[my - 1];
2720 for (y = 0; y < h; y++) {
2721 for (x = 0; x < 4; x++)
2722 dst[x] = FILTER_4TAP(tmp, filter, 4);
/*
 * 16x{h} 2-D VP8 sub-pixel MC: 6-tap horizontal then 6-tap vertical.
 * src is backed up 2 rows so the horizontal pass produces h+5 rows;
 * the vertical pass reads from tmp_array + 32 (row 2 of the 16-wide
 * temp buffer).
 * NOTE(review): #if/#else/#endif structure is not visible here.
 */
2729 void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2730 ptrdiff_t srcstride, int h, int mx, int my)
2733 DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2734 uint8_t *tmp = tmp_array;
2736 src -= 2 * srcstride;
2737 ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2738 tmp = tmp_array + 32;
2739 ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2741 const uint8_t *filter = subpel_filters[mx - 1];
2742 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2744 uint8_t tmp_array[592];
2745 uint8_t *tmp = tmp_array;
2747 src -= 2 * srcstride;
2749 for (y = 0; y < h + 5; y++) {
2750 for (x = 0; x < 16; x++)
2751 tmp[x] = FILTER_6TAP(src, filter, 1);
2756 tmp = tmp_array + 32;
2757 filter = subpel_filters[my - 1];
2759 for (y = 0; y < h; y++) {
2760 for (x = 0; x < 16; x++)
2761 dst[x] = FILTER_6TAP(tmp, filter, 16);
/*
 * 8x{h} 2-D VP8 sub-pixel MC: 6-tap horizontal then 6-tap vertical.
 * src is backed up 2 rows; horizontal pass fills h+5 rows of an
 * 8-wide temp buffer; vertical pass reads from tmp_array + 16 (row 2).
 * NOTE(review): #if/#else/#endif structure is not visible here.
 */
2768 void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2769 ptrdiff_t srcstride, int h, int mx, int my)
2772 DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2773 uint8_t *tmp = tmp_array;
2775 src -= 2 * srcstride;
2776 ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2777 tmp = tmp_array + 16;
2778 ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2780 const uint8_t *filter = subpel_filters[mx - 1];
2781 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2783 uint8_t tmp_array[168];
2784 uint8_t *tmp = tmp_array;
2786 src -= 2 * srcstride;
2788 for (y = 0; y < h + 5; y++) {
2789 for (x = 0; x < 8; x++)
2790 tmp[x] = FILTER_6TAP(src, filter, 1);
2795 tmp = tmp_array + 16;
2796 filter = subpel_filters[my - 1];
2798 for (y = 0; y < h; y++) {
2799 for (x = 0; x < 8; x++)
2800 dst[x] = FILTER_6TAP(tmp, filter, 8);
/*
 * 4x{h} 2-D VP8 sub-pixel MC: 6-tap horizontal then 6-tap vertical.
 * src is backed up 2 rows; horizontal pass fills h+5 rows of a
 * 4-wide temp buffer; vertical pass reads from tmp_array + 8 (row 2).
 * NOTE(review): #if/#else/#endif structure is not visible here.
 */
2807 void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2808 ptrdiff_t srcstride, int h, int mx, int my)
2811 DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2812 uint8_t *tmp = tmp_array;
2814 src -= 2 * srcstride;
2815 ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2816 tmp = tmp_array + 8;
2817 ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
/* Plain-C fallback: same two-pass scheme. */
2819 const uint8_t *filter = subpel_filters[mx - 1];
2820 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
2822 uint8_t tmp_array[52];
2823 uint8_t *tmp = tmp_array;
2825 src -= 2 * srcstride;
2827 for (y = 0; y < h + 5; y++) {
2828 for (x = 0; x < 4; x++)
2829 tmp[x] = FILTER_6TAP(src, filter, 1);
2834 tmp = tmp_array + 8;
2835 filter = subpel_filters[my - 1];
2837 for (y = 0; y < h; y++) {
2838 for (x = 0; x < 4; x++)
2839 dst[x] = FILTER_6TAP(tmp, filter, 4);
/*
 * 16-wide horizontal bilinear VP8 MC: dst[x] = (a*src[x] + b*src[x+1] + 4) >> 3
 * with a = 8 - mx, b = mx. The MMI path processes each row as two 8-pixel
 * halves via PUT_VP8_BILINEAR8_H_MMI; a scalar per-pixel variant and a
 * plain-C double-loop fallback are also present.
 * NOTE(review): listing fragment — the __asm__ volatile( opener, the "1:"
 * loop label, the DECLARE_VAR/RESTRICT_ASM operand lines, #if/#else/#endif
 * and braces were stripped by extraction; not compilable as shown.
 */
2846 void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2847 ptrdiff_t sstride, int h, int mx, int my)
2850 int a = 8 - mx, b = mx;
2853 mips_reg dst0, src0;
/* Scalar per-row variant: 16 explicit taps, rounding with +4 then >>3. */
2857 dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2858 dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2859 dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2860 dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2861 dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2862 dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2863 dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2864 dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
2866 dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
2867 dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
2868 dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
2869 dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
2870 dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
2871 dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
2872 dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
2873 dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
/* MMI inline asm: broadcast a and b to all 4 halfwords, set ftmp4 = 3
   (the >>3 shift count), then loop over rows. */
2876 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2877 "li %[tmp0], 0x03 \n\t"
2878 "mtc1 %[tmp0], %[ftmp4] \n\t"
2879 "pshufh %[a], %[a], %[ftmp0] \n\t"
2880 "pshufh %[b], %[b], %[ftmp0] \n\t"
/* Left 8 pixels, then right 8 at offset +8. */
2884 PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
2885 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2886 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2888 PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
2890 "addiu %[h], %[h], -0x01 \n\t"
2891 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2892 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2893 "bnez %[h], 1b \n\t"
2894 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2895 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2896 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2897 [ftmp6]"=&f"(ftmp[6]),
2898 [tmp0]"=&r"(tmp[0]),
2900 [dst0]"=&r"(dst0), [src0]"=&r"(src0),
2902 [dst]"+&r"(dst), [src]"+&r"(src),
2903 [a]"+&f"(a), [b]"+&f"(b)
2904 : [sstride]"r"((mips_reg)sstride),
2905 [dstride]"r"((mips_reg)dstride),
2906 [ff_pw_4]"f"(ff_pw_4)
/* ---- plain-C reference fallback (original #else branch; directives stripped) ---- */
2910 int a = 8 - mx, b = mx;
2913 for (y = 0; y < h; y++) {
2914 for (x = 0; x < 16; x++)
2915 dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
/*
 * 16-wide vertical bilinear VP8 MC:
 * dst[x] = (c*src[x] + d*src[x+sstride] + 4) >> 3, c = 8 - my, d = my.
 * The MMI path handles each row as two 8-pixel halves via
 * PUT_VP8_BILINEAR8_V_MMI; a scalar variant and a plain-C fallback are
 * also present.
 * NOTE(review): listing fragment — __asm__ volatile(, the "1:" label,
 * operand/constraint lines, #if/#else/#endif and braces were stripped.
 */
2922 void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2923 ptrdiff_t sstride, int h, int mx, int my)
2926 int c = 8 - my, d = my;
2929 mips_reg src0, src1, dst0;
/* Scalar variant (only 8 taps visible here; the listing dropped the rest). */
2933 dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
2934 dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
2935 dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
2936 dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
2937 dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
2938 dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
2939 dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
2940 dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
/* MMI inline asm: broadcast c and d, ftmp4 = 3 (shift count), loop rows. */
2943 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
2944 "li %[tmp0], 0x03 \n\t"
2945 "mtc1 %[tmp0], %[ftmp4] \n\t"
2946 "pshufh %[c], %[c], %[ftmp0] \n\t"
2947 "pshufh %[d], %[d], %[ftmp0] \n\t"
/* Left 8 pixels, then right 8 at offset +8. */
2951 PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
2952 PTR_ADDIU "%[src0], %[src], 0x08 \n\t"
2953 PTR_ADDIU "%[dst0], %[dst], 0x08 \n\t"
2955 PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])
2957 "addiu %[h], %[h], -0x01 \n\t"
2958 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
2959 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
2960 "bnez %[h], 1b \n\t"
2961 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
2962 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
2963 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
2964 [ftmp6]"=&f"(ftmp[6]),
2965 [tmp0]"=&r"(tmp[0]),
2967 [src0]"=&r"(src0), [dst0]"=&r"(dst0),
2970 [dst]"+&r"(dst), [src]"+&r"(src),
2971 [c]"+&f"(c), [d]"+&f"(d)
2972 : [sstride]"r"((mips_reg)sstride),
2973 [dstride]"r"((mips_reg)dstride),
2974 [ff_pw_4]"f"(ff_pw_4)
/* ---- plain-C reference fallback (original #else branch; directives stripped) ---- */
2978 int c = 8 - my, d = my;
2981 for (y = 0; y < h; y++) {
2982 for (x = 0; x < 16; x++)
2983 dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
/*
 * 16-wide h+v bilinear VP8 MC: horizontal pass over h + 1 rows into a
 * 16-stride temp buffer, then vertical pass from the temp into dst.
 * NOTE(review): listing fragment — #if/#else/#endif, braces and loop
 * variable declarations were stripped by extraction.
 */
2990 void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2991 ptrdiff_t sstride, int h, int mx, int my)
/* MMI fast path: reuse the h and v kernels. */
2994 DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
2995 uint8_t *tmp = tmp_array;
2997 ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
2998 ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
/* ---- plain-C reference fallback (original #else branch; directives stripped) ---- */
3000 int a = 8 - mx, b = mx;
3001 int c = 8 - my, d = my;
3003 uint8_t tmp_array[528];
3004 uint8_t *tmp = tmp_array;
3006 for (y = 0; y < h + 1; y++) {
3007 for (x = 0; x < 16; x++)
3008 tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
/* (per-row src/tmp advance and tmp reset lines not visible in this listing) */
3015 for (y = 0; y < h; y++) {
3016 for (x = 0; x < 16; x++)
3017 dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
/*
 * 8-wide horizontal bilinear VP8 MC:
 * dst[x] = (a*src[x] + b*src[x+1] + 4) >> 3, a = 8 - mx, b = mx.
 * One PUT_VP8_BILINEAR8_H_MMI invocation per row in the MMI path.
 * NOTE(review): listing fragment — __asm__ volatile(, the "1:" label,
 * operand lines, #if/#else/#endif and braces were stripped.
 */
3024 void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3025 ptrdiff_t sstride, int h, int mx, int my)
3028 int a = 8 - mx, b = mx;
/* Scalar per-row variant: 8 explicit taps. */
3034 dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3035 dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3036 dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3037 dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3038 dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
3039 dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
3040 dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
3041 dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
/* MMI inline asm: broadcast a and b, ftmp4 = 3 (shift count), loop rows. */
3044 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3045 "li %[tmp0], 0x03 \n\t"
3046 "mtc1 %[tmp0], %[ftmp4] \n\t"
3047 "pshufh %[a], %[a], %[ftmp0] \n\t"
3048 "pshufh %[b], %[b], %[ftmp0] \n\t"
3051 PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
3053 "addiu %[h], %[h], -0x01 \n\t"
3054 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3055 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3056 "bnez %[h], 1b \n\t"
3057 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3058 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3059 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3060 [ftmp6]"=&f"(ftmp[6]),
3061 [tmp0]"=&r"(tmp[0]),
3064 [dst]"+&r"(dst), [src]"+&r"(src),
3065 [a]"+&f"(a), [b]"+&f"(b)
3066 : [sstride]"r"((mips_reg)sstride),
3067 [dstride]"r"((mips_reg)dstride),
3068 [ff_pw_4]"f"(ff_pw_4)
/* ---- plain-C reference fallback (original #else branch; directives stripped) ---- */
3072 int a = 8 - mx, b = mx;
3075 for (y = 0; y < h; y++) {
3076 for (x = 0; x < 8; x++)
3077 dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
/*
 * 8-wide vertical bilinear VP8 MC:
 * dst[x] = (c*src[x] + d*src[x+sstride] + 4) >> 3, c = 8 - my, d = my.
 * One PUT_VP8_BILINEAR8_V_MMI invocation per row in the MMI path.
 * NOTE(review): listing fragment — __asm__ volatile(, the "1:" label,
 * operand lines (incl. src1), #if/#else/#endif and braces were stripped.
 */
3084 void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3085 ptrdiff_t sstride, int h, int mx, int my)
3088 int c = 8 - my, d = my;
/* Scalar per-row variant: 8 explicit taps against the next source row. */
3095 dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3096 dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3097 dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3098 dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3099 dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
3100 dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
3101 dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
3102 dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
/* MMI inline asm: broadcast c and d, ftmp4 = 3 (shift count), loop rows. */
3105 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3106 "li %[tmp0], 0x03 \n\t"
3107 "mtc1 %[tmp0], %[ftmp4] \n\t"
3108 "pshufh %[c], %[c], %[ftmp0] \n\t"
3109 "pshufh %[d], %[d], %[ftmp0] \n\t"
3112 PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
3114 "addiu %[h], %[h], -0x01 \n\t"
3115 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3116 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3117 "bnez %[h], 1b \n\t"
3118 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3119 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3120 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
3121 [ftmp6]"=&f"(ftmp[6]),
3122 [tmp0]"=&r"(tmp[0]),
3126 [dst]"+&r"(dst), [src]"+&r"(src),
3127 [c]"+&f"(c), [d]"+&f"(d)
3128 : [sstride]"r"((mips_reg)sstride),
3129 [dstride]"r"((mips_reg)dstride),
3130 [ff_pw_4]"f"(ff_pw_4)
/* ---- plain-C reference fallback (original #else branch; directives stripped) ---- */
3134 int c = 8 - my, d = my;
3137 for (y = 0; y < h; y++) {
3138 for (x = 0; x < 8; x++)
3139 dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
/*
 * 8-wide h+v bilinear VP8 MC: horizontal pass over h + 1 rows into an
 * 8-stride temp buffer, then vertical pass from the temp into dst.
 * NOTE(review): listing fragment — #if/#else/#endif, braces and loop
 * variable declarations were stripped by extraction.
 */
3146 void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3147 ptrdiff_t sstride, int h, int mx, int my)
/* MMI fast path: reuse the h and v kernels. */
3150 DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
3151 uint8_t *tmp = tmp_array;
3153 ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
3154 ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
/* ---- plain-C reference fallback (original #else branch; directives stripped) ---- */
3156 int a = 8 - mx, b = mx;
3157 int c = 8 - my, d = my;
3159 uint8_t tmp_array[136];
3160 uint8_t *tmp = tmp_array;
3162 for (y = 0; y < h + 1; y++) {
3163 for (x = 0; x < 8; x++)
3164 tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
/* (per-row src/tmp advance and tmp reset lines not visible in this listing) */
3171 for (y = 0; y < h; y++) {
3172 for (x = 0; x < 8; x++)
3173 dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
/*
 * 4-wide horizontal bilinear VP8 MC:
 * dst[x] = (a*src[x] + b*src[x+1] + 4) >> 3, a = 8 - mx, b = mx.
 * MMI path uses PUT_VP8_BILINEAR4_H_MMI once per row (note: fewer ftmp
 * registers than the 8/16-wide variants).
 * NOTE(review): listing fragment — __asm__ volatile(, the "1:" label,
 * operand lines, #if/#else/#endif and braces were stripped.
 */
3180 void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3181 ptrdiff_t sstride, int h, int mx, int my)
3184 int a = 8 - mx, b = mx;
/* Scalar per-row variant: 4 explicit taps. */
3191 dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3192 dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3193 dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3194 dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
/* MMI inline asm: broadcast a and b, ftmp4 = 3 (shift count), loop rows. */
3197 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3198 "li %[tmp0], 0x03 \n\t"
3199 "mtc1 %[tmp0], %[ftmp4] \n\t"
3200 "pshufh %[a], %[a], %[ftmp0] \n\t"
3201 "pshufh %[b], %[b], %[ftmp0] \n\t"
3204 PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])
3206 "addiu %[h], %[h], -0x01 \n\t"
3207 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3208 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3209 "bnez %[h], 1b \n\t"
3210 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3211 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3212 [ftmp4]"=&f"(ftmp[4]),
3213 [tmp0]"=&r"(tmp[0]),
3217 [dst]"+&r"(dst), [src]"+&r"(src),
3218 [a]"+&f"(a), [b]"+&f"(b)
3219 : [sstride]"r"((mips_reg)sstride),
3220 [dstride]"r"((mips_reg)dstride),
3221 [ff_pw_4]"f"(ff_pw_4)
/* ---- plain-C reference fallback (original #else branch; directives stripped) ---- */
3225 int a = 8 - mx, b = mx;
3228 for (y = 0; y < h; y++) {
3229 for (x = 0; x < 4; x++)
3230 dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
/*
 * 4-wide vertical bilinear VP8 MC:
 * dst[x] = (c*src[x] + d*src[x+sstride] + 4) >> 3, c = 8 - my, d = my.
 * MMI path uses PUT_VP8_BILINEAR4_V_MMI once per row.
 * NOTE(review): listing fragment — __asm__ volatile(, the "1:" label,
 * operand lines (incl. src1), #if/#else/#endif and braces were stripped.
 */
3237 void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3238 ptrdiff_t sstride, int h, int mx, int my)
3241 int c = 8 - my, d = my;
/* Scalar per-row variant: 4 explicit taps against the next source row. */
3249 dst[0] = (c * src[0] + d * src[ sstride] + 4) >> 3;
3250 dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3251 dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3252 dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
/* MMI inline asm: broadcast c and d, ftmp4 = 3 (shift count), loop rows. */
3255 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
3256 "li %[tmp0], 0x03 \n\t"
3257 "mtc1 %[tmp0], %[ftmp4] \n\t"
3258 "pshufh %[c], %[c], %[ftmp0] \n\t"
3259 "pshufh %[d], %[d], %[ftmp0] \n\t"
3262 PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])
3264 "addiu %[h], %[h], -0x01 \n\t"
3265 PTR_ADDU "%[src], %[src], %[sstride] \n\t"
3266 PTR_ADDU "%[dst], %[dst], %[dstride] \n\t"
3267 "bnez %[h], 1b \n\t"
3268 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
3269 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
3270 [ftmp4]"=&f"(ftmp[4]),
3271 [tmp0]"=&r"(tmp[0]),
3276 [dst]"+&r"(dst), [src]"+&r"(src),
3277 [c]"+&f"(c), [d]"+&f"(d)
3278 : [sstride]"r"((mips_reg)sstride),
3279 [dstride]"r"((mips_reg)dstride),
3280 [ff_pw_4]"f"(ff_pw_4)
/* ---- plain-C reference fallback (original #else branch; directives stripped) ---- */
3284 int c = 8 - my, d = my;
3287 for (y = 0; y < h; y++) {
3288 for (x = 0; x < 4; x++)
3289 dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3296 void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3297 ptrdiff_t sstride, int h, int mx, int my)
3300 DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
3301 uint8_t *tmp = tmp_array;
3303 ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
3304 ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
3306 int a = 8 - mx, b = mx;
3307 int c = 8 - my, d = my;
3309 uint8_t tmp_array[36];
3310 uint8_t *tmp = tmp_array;
3312 for (y = 0; y < h + 1; y++) {
3313 for (x = 0; x < 4; x++)
3314 tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3321 for (y = 0; y < h; y++) {
3322 for (x = 0; x < 4; x++)
3323 dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;