/*
 * Loongson SIMD optimized h264qpel
 *
 * Copyright (c) 2015 Loongson Technology Corporation Limited
 * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "h264dsp_mips.h"
#include "libavcodec/bit_depth_template.c"

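/*
 * A note on the instruction set (not in the original source): these helpers
 * use the Loongson MMI extension, which performs SIMD arithmetic in the
 * 64-bit floating-point registers ($f0..). The gslwlc1/gslwrc1 and
 * gsldlc1/gsldrc1 pairs are unaligned 32/64-bit loads into an FP register
 * (analogous to MIPS lwl/lwr); gsswlc1/gsswrc1 and gssdlc1/gssdrc1 are the
 * matching unaligned stores. copy_blockN_mmi copies an N-pixel-wide block
 * of h rows.
 */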
static inline void copy_block4_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride, int h)
{
    __asm__ volatile (
        "1: \r\n"
        "gslwlc1 $f2, 3(%[src]) \r\n"
        "gslwrc1 $f2, 0(%[src]) \r\n"
        "gsswlc1 $f2, 3(%[dst]) \r\n"
        "gsswrc1 $f2, 0(%[dst]) \r\n"
        "dadd %[src], %[src], %[srcStride] \r\n"
        "dadd %[dst], %[dst], %[dstStride] \r\n"
        "daddi %[h], %[h], -1 \r\n"
        "bnez %[h], 1b \r\n"
        : [dst]"+&r"(dst),[src]"+&r"(src)
        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),[h]"r"(h)
        : "$f2"
    );
}

static inline void copy_block8_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride, int h)
{
    __asm__ volatile (
        "1: \r\n"
        "gsldlc1 $f2, 7(%[src]) \r\n"
        "gsldrc1 $f2, 0(%[src]) \r\n"
        "gssdlc1 $f2, 7(%[dst]) \r\n"
        "gssdrc1 $f2, 0(%[dst]) \r\n"
        "dadd %[src], %[src], %[srcStride] \r\n"
        "dadd %[dst], %[dst], %[dstStride] \r\n"
        "daddi %[h], %[h], -1 \r\n"
        "bnez %[h], 1b \r\n"
        : [dst]"+&r"(dst),[src]"+&r"(src)
        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),[h]"r"(h)
        : "$f2"
    );
}

static inline void copy_block16_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride, int h)
{
    __asm__ volatile (
        "1: \r\n"
        "gsldlc1 $f2, 7(%[src]) \r\n"
        "gsldrc1 $f2, 0(%[src]) \r\n"
        "gsldlc1 $f4, 15(%[src]) \r\n"
        "gsldrc1 $f4, 8(%[src]) \r\n"
        "gssdlc1 $f2, 7(%[dst]) \r\n"
        "gssdrc1 $f2, 0(%[dst]) \r\n"
        "gssdlc1 $f4, 15(%[dst]) \r\n"
        "gssdrc1 $f4, 8(%[dst]) \r\n"
        "dadd %[src], %[src], %[srcStride] \r\n"
        "dadd %[dst], %[dst], %[dstStride] \r\n"
        "daddi %[h], %[h], -1 \r\n"
        "bnez %[h], 1b \r\n"
        : [dst]"+&r"(dst),[src]"+&r"(src)
        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),[h]"r"(h)
        : "$f2","$f4"
    );
}

#define op_put(a, b) a = b
#define op_avg(a, b) a = rnd_avg_pixel4(a, b)

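/*
 * put_pixelsN_mmi copies an NxH block; avg_pixelsN_mmi averages the source
 * into the destination with round-to-nearest (pavgb averages packed
 * unsigned bytes with rounding), matching the op_put/op_avg semantics
 * defined above.
 */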
static inline void put_pixels4_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    __asm__ volatile (
        "1: \r\n"
        "gslwlc1 $f2, 3(%[pixels]) \r\n"
        "gslwrc1 $f2, 0(%[pixels]) \r\n"
        "gsswlc1 $f2, 3(%[block]) \r\n"
        "gsswrc1 $f2, 0(%[block]) \r\n"
        "dadd %[pixels], %[pixels], %[line_size] \r\n"
        "dadd %[block], %[block], %[line_size] \r\n"
        "daddi %[h], %[h], -1 \r\n"
        "bnez %[h], 1b \r\n"
        : [block]"+&r"(block),[pixels]"+&r"(pixels)
        : [line_size]"r"(line_size),[h]"r"(h)
        : "$f2"
    );
}

static inline void put_pixels8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    __asm__ volatile (
        "1: \r\n"
        "gsldlc1 $f2, 7(%[pixels]) \r\n"
        "gsldrc1 $f2, 0(%[pixels]) \r\n"
        "gssdlc1 $f2, 7(%[block]) \r\n"
        "gssdrc1 $f2, 0(%[block]) \r\n"
        "dadd %[pixels], %[pixels], %[line_size] \r\n"
        "dadd %[block], %[block], %[line_size] \r\n"
        "daddi %[h], %[h], -1 \r\n"
        "bnez %[h], 1b \r\n"
        : [block]"+&r"(block),[pixels]"+&r"(pixels)
        : [line_size]"r"(line_size),[h]"r"(h)
        : "$f2"
    );
}

static inline void put_pixels16_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    __asm__ volatile (
        "1: \r\n"
        "gsldlc1 $f2, 7(%[pixels]) \r\n"
        "gsldrc1 $f2, 0(%[pixels]) \r\n"
        "gsldlc1 $f4, 15(%[pixels]) \r\n"
        "gsldrc1 $f4, 8(%[pixels]) \r\n"
        "gssdlc1 $f2, 7(%[block]) \r\n"
        "gssdrc1 $f2, 0(%[block]) \r\n"
        "gssdlc1 $f4, 15(%[block]) \r\n"
        "gssdrc1 $f4, 8(%[block]) \r\n"
        "dadd %[pixels], %[pixels], %[line_size] \r\n"
        "dadd %[block], %[block], %[line_size] \r\n"
        "daddi %[h], %[h], -1 \r\n"
        "bnez %[h], 1b \r\n"
        : [block]"+&r"(block),[pixels]"+&r"(pixels)
        : [line_size]"r"(line_size),[h]"r"(h)
        : "$f2","$f4"
    );
}

static inline void avg_pixels4_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    __asm__ volatile (
        "1: \r\n"
        "gslwlc1 $f2, 3(%[pixels]) \r\n"
        "gslwrc1 $f2, 0(%[pixels]) \r\n"
        "gslwlc1 $f4, 3(%[block]) \r\n"
        "gslwrc1 $f4, 0(%[block]) \r\n"
        "pavgb $f2, $f2, $f4 \r\n"
        "gsswlc1 $f2, 3(%[block]) \r\n"
        "gsswrc1 $f2, 0(%[block]) \r\n"
        "dadd %[pixels], %[pixels], %[line_size] \r\n"
        "dadd %[block], %[block], %[line_size] \r\n"
        "daddi %[h], %[h], -1 \r\n"
        "bnez %[h], 1b \r\n"
        : [block]"+&r"(block),[pixels]"+&r"(pixels)
        : [line_size]"r"(line_size),[h]"r"(h)
        : "$f2","$f4"
    );
}

static inline void avg_pixels8_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    __asm__ volatile (
        "1: \r\n"
        "gsldlc1 $f2, 7(%[block]) \r\n"
        "gsldrc1 $f2, 0(%[block]) \r\n"
        "gsldlc1 $f4, 7(%[pixels]) \r\n"
        "gsldrc1 $f4, 0(%[pixels]) \r\n"
        "pavgb $f2, $f2, $f4 \r\n"
        "gssdlc1 $f2, 7(%[block]) \r\n"
        "gssdrc1 $f2, 0(%[block]) \r\n"
        "dadd %[pixels], %[pixels], %[line_size] \r\n"
        "dadd %[block], %[block], %[line_size] \r\n"
        "daddi %[h], %[h], -1 \r\n"
        "bnez %[h], 1b \r\n"
        : [block]"+&r"(block),[pixels]"+&r"(pixels)
        : [line_size]"r"(line_size),[h]"r"(h)
        : "$f2","$f4"
    );
}

static inline void avg_pixels16_mmi(uint8_t *block, const uint8_t *pixels,
        ptrdiff_t line_size, int h)
{
    __asm__ volatile (
        "1: \r\n"
        "gsldlc1 $f2, 7(%[block]) \r\n"
        "gsldrc1 $f2, 0(%[block]) \r\n"
        "gsldlc1 $f4, 15(%[block]) \r\n"
        "gsldrc1 $f4, 8(%[block]) \r\n"
        "gsldlc1 $f6, 7(%[pixels]) \r\n"
        "gsldrc1 $f6, 0(%[pixels]) \r\n"
        "gsldlc1 $f8, 15(%[pixels]) \r\n"
        "gsldrc1 $f8, 8(%[pixels]) \r\n"
        "pavgb $f2, $f2, $f6 \r\n"
        "pavgb $f4, $f4, $f8 \r\n"
        "gssdlc1 $f2, 7(%[block]) \r\n"
        "gssdrc1 $f2, 0(%[block]) \r\n"
        "gssdlc1 $f4, 15(%[block]) \r\n"
        "gssdrc1 $f4, 8(%[block]) \r\n"
        "dadd %[pixels], %[pixels], %[line_size] \r\n"
        "dadd %[block], %[block], %[line_size] \r\n"
        "daddi %[h], %[h], -1 \r\n"
        "bnez %[h], 1b \r\n"
        : [block]"+&r"(block),[pixels]"+&r"(pixels)
        : [line_size]"r"(line_size),[h]"r"(h)
        : "$f2","$f4","$f6","$f8"
    );
}

static inline void put_pixels4_l2_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    int i;
    for (i = 0; i < h; i++) {
        pixel4 a, b;
        a = AV_RN4P(&src1[i * src_stride1]);
        b = AV_RN4P(&src2[i * src_stride2]);
        op_put(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));
    }
}

static inline void put_pixels8_l2_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    int i;
    for (i = 0; i < h; i++) {
        pixel4 a, b;
        a = AV_RN4P(&src1[i * src_stride1]);
        b = AV_RN4P(&src2[i * src_stride2]);
        op_put(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));
        a = AV_RN4P(&src1[i * src_stride1 + 4]);
        b = AV_RN4P(&src2[i * src_stride2 + 4]);
        op_put(*((pixel4 *) &dst[i * dst_stride + 4]), rnd_avg_pixel4(a, b));
    }
}

static inline void put_pixels16_l2_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    int i;
    for (i = 0; i < h; i++) {
        pixel4 a, b;
        a = AV_RN4P(&src1[i * src_stride1]);
        b = AV_RN4P(&src2[i * src_stride2]);
        op_put(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));
        a = AV_RN4P(&src1[i * src_stride1 + 4]);
        b = AV_RN4P(&src2[i * src_stride2 + 4]);
        op_put(*((pixel4 *) &dst[i * dst_stride + 4]), rnd_avg_pixel4(a, b));
        a = AV_RN4P(&src1[i * src_stride1 + 8]);
        b = AV_RN4P(&src2[i * src_stride2 + 8]);
        op_put(*((pixel4 *) &dst[i * dst_stride + 8]), rnd_avg_pixel4(a, b));
        a = AV_RN4P(&src1[i * src_stride1 + 12]);
        b = AV_RN4P(&src2[i * src_stride2 + 12]);
        op_put(*((pixel4 *) &dst[i * dst_stride + 12]), rnd_avg_pixel4(a, b));
    }
}

static inline void avg_pixels4_l2_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    int i;
    for (i = 0; i < h; i++) {
        pixel4 a, b;
        a = AV_RN4P(&src1[i * src_stride1]);
        b = AV_RN4P(&src2[i * src_stride2]);
        op_avg(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));
    }
}

static inline void avg_pixels8_l2_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    int i;
    for (i = 0; i < h; i++) {
        pixel4 a, b;
        a = AV_RN4P(&src1[i * src_stride1]);
        b = AV_RN4P(&src2[i * src_stride2]);
        op_avg(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));
        a = AV_RN4P(&src1[i * src_stride1 + 4]);
        b = AV_RN4P(&src2[i * src_stride2 + 4]);
        op_avg(*((pixel4 *) &dst[i * dst_stride + 4]), rnd_avg_pixel4(a, b));
    }
}

static inline void avg_pixels16_l2_mmi(uint8_t *dst, const uint8_t *src1,
        const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
        int h)
{
    int i;
    for (i = 0; i < h; i++) {
        pixel4 a, b;
        a = AV_RN4P(&src1[i * src_stride1]);
        b = AV_RN4P(&src2[i * src_stride2]);
        op_avg(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));
        a = AV_RN4P(&src1[i * src_stride1 + 4]);
        b = AV_RN4P(&src2[i * src_stride2 + 4]);
        op_avg(*((pixel4 *) &dst[i * dst_stride + 4]), rnd_avg_pixel4(a, b));
        a = AV_RN4P(&src1[i * src_stride1 + 8]);
        b = AV_RN4P(&src2[i * src_stride2 + 8]);
        op_avg(*((pixel4 *) &dst[i * dst_stride + 8]), rnd_avg_pixel4(a, b));
        a = AV_RN4P(&src1[i * src_stride1 + 12]);
        b = AV_RN4P(&src2[i * src_stride2 + 12]);
        op_avg(*((pixel4 *) &dst[i * dst_stride + 12]), rnd_avg_pixel4(a, b));
    }
}

#define op2_avg(a, b) a = (((a)+CLIP(((b) + 512)>>10)+1)>>1)
#define op2_put(a, b) a = CLIP(((b) + 512)>>10)

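/*
 * H.264 half-pel luma samples come from the 6-tap filter (1, -5, 20, 20,
 * -5, 1): for taps C,D,E,F,G,H the once-filtered output is
 *   b = ((E + F) * 20 - (D + G) * 5 + (C + H) + 16) >> 5
 * which is what the h/v lowpass kernels below compute. The op2_put/op2_avg
 * macros above handle the centre (hv) sample, which is filtered twice and
 * therefore carries a 10-bit scale, hence the (x + 512) >> 10 rounding.
 */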
static void put_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    __asm__ volatile (
        "xor $f0, $f0, $f0 \r\n"
        "dli $8, 4 \r\n"
        "1: \r\n"
        "gslwlc1 $f2, 1(%[src]) \r\n"
        "gslwrc1 $f2, -2(%[src]) \r\n"
        "gslwlc1 $f4, 2(%[src]) \r\n"
        "gslwrc1 $f4, -1(%[src]) \r\n"
        "gslwlc1 $f6, 3(%[src]) \r\n"
        "gslwrc1 $f6, 0(%[src]) \r\n"
        "gslwlc1 $f8, 4(%[src]) \r\n"
        "gslwrc1 $f8, 1(%[src]) \r\n"
        "gslwlc1 $f10, 5(%[src]) \r\n"
        "gslwrc1 $f10, 2(%[src]) \r\n"
        "gslwlc1 $f12, 6(%[src]) \r\n"
        "gslwrc1 $f12, 3(%[src]) \r\n"
        "punpcklbh $f2, $f2, $f0 \r\n"
        "punpcklbh $f4, $f4, $f0 \r\n"
        "punpcklbh $f6, $f6, $f0 \r\n"
        "punpcklbh $f8, $f8, $f0 \r\n"
        "punpcklbh $f10, $f10, $f0 \r\n"
        "punpcklbh $f12, $f12, $f0 \r\n"
        "paddsh $f14, $f6, $f8 \r\n"
        "paddsh $f16, $f4, $f10 \r\n"
        "paddsh $f18, $f2, $f12 \r\n"
        "pmullh $f14, $f14, %[ff_pw_20] \r\n"
        "pmullh $f16, $f16, %[ff_pw_5] \r\n"
        "psubsh $f14, $f14, $f16 \r\n"
        "paddsh $f18, $f14, $f18 \r\n"
        "paddsh $f18, $f18, %[ff_pw_16] \r\n"
        "psrah $f18, $f18, %[ff_pw_5] \r\n"
        "packushb $f18, $f18, $f0 \r\n"
        "gsswlc1 $f18, 3(%[dst]) \r\n"
        "gsswrc1 $f18, 0(%[dst]) \r\n"
        "dadd %[dst], %[dst], %[dstStride] \r\n"
        "dadd %[src], %[src], %[srcStride] \r\n"
        "daddi $8, $8, -1 \r\n"
        "bnez $8, 1b \r\n"
        : [dst]"+&r"(dst),[src]"+&r"(src)
        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),
          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5),[ff_pw_16]"f"(ff_pw_16)
        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
          "$f18"
    );
}

static void put_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    __asm__ volatile (
        "xor $f0, $f0, $f0 \r\n"
        "dli $8, 8 \r\n"
        "1: \r\n"
        "gsldlc1 $f2, 5(%[src]) \r\n"
        "gsldrc1 $f2, -2(%[src]) \r\n"
        "gsldlc1 $f4, 6(%[src]) \r\n"
        "gsldrc1 $f4, -1(%[src]) \r\n"
        "gsldlc1 $f6, 7(%[src]) \r\n"
        "gsldrc1 $f6, 0(%[src]) \r\n"
        "gsldlc1 $f8, 8(%[src]) \r\n"
        "gsldrc1 $f8, 1(%[src]) \r\n"
        "gsldlc1 $f10, 9(%[src]) \r\n"
        "gsldrc1 $f10, 2(%[src]) \r\n"
        "gsldlc1 $f12, 10(%[src]) \r\n"
        "gsldrc1 $f12, 3(%[src]) \r\n"
        "punpcklbh $f14, $f6, $f0 \r\n"
        "punpckhbh $f16, $f6, $f0 \r\n"
        "punpcklbh $f18, $f8, $f0 \r\n"
        "punpckhbh $f20, $f8, $f0 \r\n"
        "paddsh $f6, $f14, $f18 \r\n"
        "paddsh $f8, $f16, $f20 \r\n"
        "pmullh $f6, $f6, %[ff_pw_20] \r\n"
        "pmullh $f8, $f8, %[ff_pw_20] \r\n"
        "punpcklbh $f14, $f4, $f0 \r\n"
        "punpckhbh $f16, $f4, $f0 \r\n"
        "punpcklbh $f18, $f10, $f0 \r\n"
        "punpckhbh $f20, $f10, $f0 \r\n"
        "paddsh $f4, $f14, $f18 \r\n"
        "paddsh $f10, $f16, $f20 \r\n"
        "pmullh $f4, $f4, %[ff_pw_5] \r\n"
        "pmullh $f10, $f10, %[ff_pw_5] \r\n"
        "punpcklbh $f14, $f2, $f0 \r\n"
        "punpckhbh $f16, $f2, $f0 \r\n"
        "punpcklbh $f18, $f12, $f0 \r\n"
        "punpckhbh $f20, $f12, $f0 \r\n"
        "paddsh $f2, $f14, $f18 \r\n"
        "paddsh $f12, $f16, $f20 \r\n"
        "psubsh $f6, $f6, $f4 \r\n"
        "psubsh $f8, $f8, $f10 \r\n"
        "paddsh $f6, $f6, $f2 \r\n"
        "paddsh $f8, $f8, $f12 \r\n"
        "paddsh $f6, $f6, %[ff_pw_16] \r\n"
        "paddsh $f8, $f8, %[ff_pw_16] \r\n"
        "psrah $f6, $f6, %[ff_pw_5] \r\n"
        "psrah $f8, $f8, %[ff_pw_5] \r\n"
        "packushb $f18, $f6, $f8 \r\n"
        "sdc1 $f18, 0(%[dst]) \r\n"
        "dadd %[dst], %[dst], %[dstStride] \r\n"
        "dadd %[src], %[src], %[srcStride] \r\n"
        "daddi $8, $8, -1 \r\n"
        "bnez $8, 1b \r\n"
        : [dst]"+&r"(dst),[src]"+&r"(src)
        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),
          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5),[ff_pw_16]"f"(ff_pw_16)
        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
          "$f18","$f20"
    );
}

static void put_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    put_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
    put_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
    src += 8*srcStride;
    dst += 8*dstStride;
    put_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
    put_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
}

static void avg_h264_qpel4_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    __asm__ volatile (
        "xor $f0, $f0, $f0 \r\n"
        "dli $8, 4 \r\n"
        "1: \r\n"
        "gslwlc1 $f2, 1(%[src]) \r\n"
        "gslwrc1 $f2, -2(%[src]) \r\n"
        "gslwlc1 $f4, 2(%[src]) \r\n"
        "gslwrc1 $f4, -1(%[src]) \r\n"
        "gslwlc1 $f6, 3(%[src]) \r\n"
        "gslwrc1 $f6, 0(%[src]) \r\n"
        "gslwlc1 $f8, 4(%[src]) \r\n"
        "gslwrc1 $f8, 1(%[src]) \r\n"
        "gslwlc1 $f10, 5(%[src]) \r\n"
        "gslwrc1 $f10, 2(%[src]) \r\n"
        "gslwlc1 $f12, 6(%[src]) \r\n"
        "gslwrc1 $f12, 3(%[src]) \r\n"
        "punpcklbh $f2, $f2, $f0 \r\n"
        "punpcklbh $f4, $f4, $f0 \r\n"
        "punpcklbh $f6, $f6, $f0 \r\n"
        "punpcklbh $f8, $f8, $f0 \r\n"
        "punpcklbh $f10, $f10, $f0 \r\n"
        "punpcklbh $f12, $f12, $f0 \r\n"
        "paddsh $f14, $f6, $f8 \r\n"
        "paddsh $f16, $f4, $f10 \r\n"
        "paddsh $f18, $f2, $f12 \r\n"
        "pmullh $f14, $f14, %[ff_pw_20] \r\n"
        "pmullh $f16, $f16, %[ff_pw_5] \r\n"
        "psubsh $f14, $f14, $f16 \r\n"
        "paddsh $f18, $f14, $f18 \r\n"
        "paddsh $f18, $f18, %[ff_pw_16] \r\n"
        "psrah $f18, $f18, %[ff_pw_5] \r\n"
        "packushb $f18, $f18, $f0 \r\n"
        "lwc1 $f20, 0(%[dst]) \r\n"
        "pavgb $f18, $f18, $f20 \r\n"
        "gsswlc1 $f18, 3(%[dst]) \r\n"
        "gsswrc1 $f18, 0(%[dst]) \r\n"
        "dadd %[dst], %[dst], %[dstStride] \r\n"
        "dadd %[src], %[src], %[srcStride] \r\n"
        "daddi $8, $8, -1 \r\n"
        "bnez $8, 1b \r\n"
        : [dst]"+&r"(dst),[src]"+&r"(src)
        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),
          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5),[ff_pw_16]"f"(ff_pw_16)
        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
          "$f18","$f20"
    );
}

static void avg_h264_qpel8_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    __asm__ volatile (
        "xor $f0, $f0, $f0 \r\n"
        "dli $8, 8 \r\n"
        "1: \r\n"
        "gsldlc1 $f2, 5(%[src]) \r\n"
        "gsldrc1 $f2, -2(%[src]) \r\n"
        "gsldlc1 $f4, 6(%[src]) \r\n"
        "gsldrc1 $f4, -1(%[src]) \r\n"
        "gsldlc1 $f6, 7(%[src]) \r\n"
        "gsldrc1 $f6, 0(%[src]) \r\n"
        "gsldlc1 $f8, 8(%[src]) \r\n"
        "gsldrc1 $f8, 1(%[src]) \r\n"
        "gsldlc1 $f10, 9(%[src]) \r\n"
        "gsldrc1 $f10, 2(%[src]) \r\n"
        "gsldlc1 $f12, 10(%[src]) \r\n"
        "gsldrc1 $f12, 3(%[src]) \r\n"
        "punpcklbh $f14, $f6, $f0 \r\n"
        "punpckhbh $f16, $f6, $f0 \r\n"
        "punpcklbh $f18, $f8, $f0 \r\n"
        "punpckhbh $f20, $f8, $f0 \r\n"
        "paddsh $f6, $f14, $f18 \r\n"
        "paddsh $f8, $f16, $f20 \r\n"
        "pmullh $f6, $f6, %[ff_pw_20] \r\n"
        "pmullh $f8, $f8, %[ff_pw_20] \r\n"
        "punpcklbh $f14, $f4, $f0 \r\n"
        "punpckhbh $f16, $f4, $f0 \r\n"
        "punpcklbh $f18, $f10, $f0 \r\n"
        "punpckhbh $f20, $f10, $f0 \r\n"
        "paddsh $f4, $f14, $f18 \r\n"
        "paddsh $f10, $f16, $f20 \r\n"
        "pmullh $f4, $f4, %[ff_pw_5] \r\n"
        "pmullh $f10, $f10, %[ff_pw_5] \r\n"
        "punpcklbh $f14, $f2, $f0 \r\n"
        "punpckhbh $f16, $f2, $f0 \r\n"
        "punpcklbh $f18, $f12, $f0 \r\n"
        "punpckhbh $f20, $f12, $f0 \r\n"
        "paddsh $f2, $f14, $f18 \r\n"
        "paddsh $f12, $f16, $f20 \r\n"
        "psubsh $f6, $f6, $f4 \r\n"
        "psubsh $f8, $f8, $f10 \r\n"
        "paddsh $f6, $f6, $f2 \r\n"
        "paddsh $f8, $f8, $f12 \r\n"
        "paddsh $f6, $f6, %[ff_pw_16] \r\n"
        "paddsh $f8, $f8, %[ff_pw_16] \r\n"
        "psrah $f6, $f6, %[ff_pw_5] \r\n"
        "psrah $f8, $f8, %[ff_pw_5] \r\n"
        "packushb $f18, $f6, $f8 \r\n"
        "ldc1 $f20, 0(%[dst]) \r\n"
        "pavgb $f18, $f18, $f20 \r\n"
        "sdc1 $f18, 0(%[dst]) \r\n"
        "dadd %[dst], %[dst], %[dstStride] \r\n"
        "dadd %[src], %[src], %[srcStride] \r\n"
        "daddi $8, $8, -1 \r\n"
        "bnez $8, 1b \r\n"
        : [dst]"+&r"(dst),[src]"+&r"(src)
        : [dstStride]"r"(dstStride),[srcStride]"r"(srcStride),
          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5),[ff_pw_16]"f"(ff_pw_16)
        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16",
          "$f18","$f20"
    );
}

static void avg_h264_qpel16_h_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    avg_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
    avg_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
    src += 8*srcStride;
    dst += 8*dstStride;
    avg_h264_qpel8_h_lowpass_mmi(dst, src, dstStride, srcStride);
    avg_h264_qpel8_h_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
}

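/*
 * The vertical lowpass kernels apply the same 6-tap filter down a column.
 * They are fully unrolled: all needed rows (two above the block, the block
 * itself, three below) are loaded and unpacked to 16-bit once, then each
 * output row is 20*(r0+r1) - 5*(r-1+r2) + (r-2+r3), rounded and packed.
 * The 8-pixel-wide versions factor the multiply as
 * (4*(r0+r1) - r-1 - r2) * 5, which is the same filter by the identity
 * 20*x - 5*y == 5*(4*x - y).
 */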
static void put_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    __asm__ volatile (
        "xor $f0, $f0, $f0 \r\n"
        "gslwlc1 $f2, 3(%[srcB]) \r\n"
        "gslwrc1 $f2, 0(%[srcB]) \r\n"
        "gslwlc1 $f4, 3(%[srcA]) \r\n"
        "gslwrc1 $f4, 0(%[srcA]) \r\n"
        "gslwlc1 $f6, 3(%[src0]) \r\n"
        "gslwrc1 $f6, 0(%[src0]) \r\n"
        "gslwlc1 $f8, 3(%[src1]) \r\n"
        "gslwrc1 $f8, 0(%[src1]) \r\n"
        "gslwlc1 $f10, 3(%[src2]) \r\n"
        "gslwrc1 $f10, 0(%[src2]) \r\n"
        "gslwlc1 $f12, 3(%[src3]) \r\n"
        "gslwrc1 $f12, 0(%[src3]) \r\n"
        "gslwlc1 $f14, 3(%[src4]) \r\n"
        "gslwrc1 $f14, 0(%[src4]) \r\n"
        "gslwlc1 $f16, 3(%[src5]) \r\n"
        "gslwrc1 $f16, 0(%[src5]) \r\n"
        "gslwlc1 $f18, 3(%[src6]) \r\n"
        "gslwrc1 $f18, 0(%[src6]) \r\n"
        "punpcklbh $f2, $f2, $f0 \r\n"
        "punpcklbh $f4, $f4, $f0 \r\n"
        "punpcklbh $f6, $f6, $f0 \r\n"
        "punpcklbh $f8, $f8, $f0 \r\n"
        "punpcklbh $f10, $f10, $f0 \r\n"
        "punpcklbh $f12, $f12, $f0 \r\n"
        "punpcklbh $f14, $f14, $f0 \r\n"
        "punpcklbh $f16, $f16, $f0 \r\n"
        "punpcklbh $f18, $f18, $f0 \r\n"
        "paddsh $f20, $f6, $f8 \r\n"
        "pmullh $f20, $f20, %[ff_pw_20] \r\n"
        "paddsh $f22, $f4, $f10 \r\n"
        "pmullh $f22, $f22, %[ff_pw_5] \r\n"
        "psubsh $f24, $f20, $f22 \r\n"
        "paddsh $f24, $f24, $f2 \r\n"
        "paddsh $f24, $f24, $f12 \r\n"
        "paddsh $f20, $f8, $f10 \r\n"
        "pmullh $f20, $f20, %[ff_pw_20] \r\n"
        "paddsh $f22, $f6, $f12 \r\n"
        "pmullh $f22, $f22, %[ff_pw_5] \r\n"
        "psubsh $f26, $f20, $f22 \r\n"
        "paddsh $f26, $f26, $f4 \r\n"
        "paddsh $f26, $f26, $f14 \r\n"
        "paddsh $f20, $f10, $f12 \r\n"
        "pmullh $f20, $f20, %[ff_pw_20] \r\n"
        "paddsh $f22, $f8, $f14 \r\n"
        "pmullh $f22, $f22, %[ff_pw_5] \r\n"
        "psubsh $f28, $f20, $f22 \r\n"
        "paddsh $f28, $f28, $f6 \r\n"
        "paddsh $f28, $f28, $f16 \r\n"
        "paddsh $f20, $f12, $f14 \r\n"
        "pmullh $f20, $f20, %[ff_pw_20] \r\n"
        "paddsh $f22, $f10, $f16 \r\n"
        "pmullh $f22, $f22, %[ff_pw_5] \r\n"
        "psubsh $f30, $f20, $f22 \r\n"
        "paddsh $f30, $f30, $f8 \r\n"
        "paddsh $f30, $f30, $f18 \r\n"
        "paddsh $f24, $f24, %[ff_pw_16] \r\n"
        "paddsh $f26, $f26, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "paddsh $f30, $f30, %[ff_pw_16] \r\n"
        "psrah $f24, $f24, %[ff_pw_5] \r\n"
        "psrah $f26, $f26, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "psrah $f30, $f30, %[ff_pw_5] \r\n"
        "packushb $f24, $f24, $f0 \r\n"
        "packushb $f26, $f26, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "packushb $f30, $f30, $f0 \r\n"
        "swc1 $f24, 0(%[dst0]) \r\n"
        "swc1 $f26, 0(%[dst1]) \r\n"
        "swc1 $f28, 0(%[dst2]) \r\n"
        "swc1 $f30, 0(%[dst3]) \r\n"
        ::[dst0]"r"(dst), [dst1]"r"(dst+dstStride),
          [dst2]"r"(dst+2*dstStride), [dst3]"r"(dst+3*dstStride),
          [srcB]"r"(src-2*srcStride), [srcA]"r"(src-srcStride),
          [src0]"r"(src), [src1]"r"(src+srcStride),
          [src2]"r"(src+2*srcStride), [src3]"r"(src+3*srcStride),
          [src4]"r"(src+4*srcStride), [src5]"r"(src+5*srcStride),
          [src6]"r"(src+6*srcStride), [ff_pw_20]"f"(ff_pw_20),
          [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16)
        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18",
          "$f20","$f22","$f24","$f26","$f28","$f30"
    );
}

static void put_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    __asm__ volatile (
        "xor $f0, $f0, $f0 \r\n"
        "gsldlc1 $f2, 7(%[srcB]) \r\n"
        "gsldrc1 $f2, 0(%[srcB]) \r\n"
        "gsldlc1 $f4, 7(%[srcA]) \r\n"
        "gsldrc1 $f4, 0(%[srcA]) \r\n"
        "gsldlc1 $f6, 7(%[src0]) \r\n"
        "gsldrc1 $f6, 0(%[src0]) \r\n"
        "gsldlc1 $f8, 7(%[src1]) \r\n"
        "gsldrc1 $f8, 0(%[src1]) \r\n"
        "gsldlc1 $f10, 7(%[src2]) \r\n"
        "gsldrc1 $f10, 0(%[src2]) \r\n"
        "gsldlc1 $f12, 7(%[src3]) \r\n"
        "gsldrc1 $f12, 0(%[src3]) \r\n"
        "gsldlc1 $f14, 7(%[src4]) \r\n"
        "gsldrc1 $f14, 0(%[src4]) \r\n"
        "gsldlc1 $f16, 7(%[src5]) \r\n"
        "gsldrc1 $f16, 0(%[src5]) \r\n"
        "gsldlc1 $f18, 7(%[src6]) \r\n"
        "gsldrc1 $f18, 0(%[src6]) \r\n"
        "gsldlc1 $f20, 7(%[src7]) \r\n"
        "gsldrc1 $f20, 0(%[src7]) \r\n"
        "gsldlc1 $f22, 7(%[src8]) \r\n"
        "gsldrc1 $f22, 0(%[src8]) \r\n"
        "gsldlc1 $f24, 7(%[src9]) \r\n"
        "gsldrc1 $f24, 0(%[src9]) \r\n"
        "gsldlc1 $f26, 7(%[src10]) \r\n"
        "gsldrc1 $f26, 0(%[src10]) \r\n"
        "punpcklbh $f1, $f2, $f0 \r\n"
        "punpckhbh $f2, $f2, $f0 \r\n"
        "punpcklbh $f3, $f4, $f0 \r\n"
        "punpckhbh $f4, $f4, $f0 \r\n"
        "punpcklbh $f5, $f6, $f0 \r\n"
        "punpckhbh $f6, $f6, $f0 \r\n"
        "punpcklbh $f7, $f8, $f0 \r\n"
        "punpckhbh $f8, $f8, $f0 \r\n"
        "punpcklbh $f9, $f10, $f0 \r\n"
        "punpckhbh $f10, $f10, $f0 \r\n"
        "punpcklbh $f11, $f12, $f0 \r\n"
        "punpckhbh $f12, $f12, $f0 \r\n"
        "punpcklbh $f13, $f14, $f0 \r\n"
        "punpckhbh $f14, $f14, $f0 \r\n"
        "punpcklbh $f15, $f16, $f0 \r\n"
        "punpckhbh $f16, $f16, $f0 \r\n"
        "punpcklbh $f17, $f18, $f0 \r\n"
        "punpckhbh $f18, $f18, $f0 \r\n"
        "punpcklbh $f19, $f20, $f0 \r\n"
        "punpckhbh $f20, $f20, $f0 \r\n"
        "punpcklbh $f21, $f22, $f0 \r\n"
        "punpckhbh $f22, $f22, $f0 \r\n"
        "punpcklbh $f23, $f24, $f0 \r\n"
        "punpckhbh $f24, $f24, $f0 \r\n"
        "punpcklbh $f25, $f26, $f0 \r\n"
        "punpckhbh $f26, $f26, $f0 \r\n"
        "paddsh $f27, $f5, $f7 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f6, $f8 \r\n" //src0+src1
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f3 \r\n"
        "psubsh $f28, $f28, $f4 \r\n"
        "psubsh $f27, $f27, $f9 \r\n"
        "psubsh $f28, $f28, $f10 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f1 \r\n"
        "paddsh $f28, $f28, $f2 \r\n"
        "paddsh $f27, $f27, $f11 \r\n"
        "paddsh $f28, $f28, $f12 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f2, $f27, $f28 \r\n"
        "sdc1 $f2, 0(%[dst0]) \r\n"
        "paddsh $f27, $f7, $f9 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f8, $f10 \r\n" //src1+src2
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f5 \r\n"
        "psubsh $f28, $f28, $f6 \r\n"
        "psubsh $f27, $f27, $f11 \r\n"
        "psubsh $f28, $f28, $f12 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f3 \r\n"
        "paddsh $f28, $f28, $f4 \r\n"
        "paddsh $f27, $f27, $f13 \r\n"
        "paddsh $f28, $f28, $f14 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f4, $f27, $f28 \r\n"
        "sdc1 $f4, 0(%[dst1]) \r\n"
        "paddsh $f27, $f9, $f11 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f10, $f12 \r\n" //src2+src3
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f7 \r\n"
        "psubsh $f28, $f28, $f8 \r\n"
        "psubsh $f27, $f27, $f13 \r\n"
        "psubsh $f28, $f28, $f14 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f5 \r\n"
        "paddsh $f28, $f28, $f6 \r\n"
        "paddsh $f27, $f27, $f15 \r\n"
        "paddsh $f28, $f28, $f16 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f6, $f27, $f28 \r\n"
        "sdc1 $f6, 0(%[dst2]) \r\n"
        "paddsh $f27, $f11, $f13 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f12, $f14 \r\n" //src3+src4
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f9 \r\n"
        "psubsh $f28, $f28, $f10 \r\n"
        "psubsh $f27, $f27, $f15 \r\n"
        "psubsh $f28, $f28, $f16 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f7 \r\n"
        "paddsh $f28, $f28, $f8 \r\n"
        "paddsh $f27, $f27, $f17 \r\n"
        "paddsh $f28, $f28, $f18 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f8, $f27, $f28 \r\n"
        "sdc1 $f8, 0(%[dst3]) \r\n"
        "paddsh $f27, $f13, $f15 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f14, $f16 \r\n" //src4+src5
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f11 \r\n"
        "psubsh $f28, $f28, $f12 \r\n"
        "psubsh $f27, $f27, $f17 \r\n"
        "psubsh $f28, $f28, $f18 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f9 \r\n"
        "paddsh $f28, $f28, $f10 \r\n"
        "paddsh $f27, $f27, $f19 \r\n"
        "paddsh $f28, $f28, $f20 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f10, $f27, $f28 \r\n"
        "sdc1 $f10, 0(%[dst4]) \r\n"
        "paddsh $f27, $f15, $f17 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f16, $f18 \r\n" //src5+src6
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f13 \r\n"
        "psubsh $f28, $f28, $f14 \r\n"
        "psubsh $f27, $f27, $f19 \r\n"
        "psubsh $f28, $f28, $f20 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f11 \r\n"
        "paddsh $f28, $f28, $f12 \r\n"
        "paddsh $f27, $f27, $f21 \r\n"
        "paddsh $f28, $f28, $f22 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f12, $f27, $f28 \r\n"
        "sdc1 $f12, 0(%[dst5]) \r\n"
        "paddsh $f27, $f17, $f19 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f18, $f20 \r\n" //src6+src7
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f15 \r\n"
        "psubsh $f28, $f28, $f16 \r\n"
        "psubsh $f27, $f27, $f21 \r\n"
        "psubsh $f28, $f28, $f22 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f13 \r\n"
        "paddsh $f28, $f28, $f14 \r\n"
        "paddsh $f27, $f27, $f23 \r\n"
        "paddsh $f28, $f28, $f24 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f14, $f27, $f28 \r\n"
        "sdc1 $f14, 0(%[dst6]) \r\n"
        "paddsh $f27, $f19, $f21 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f20, $f22 \r\n" //src7+src8
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f17 \r\n"
        "psubsh $f28, $f28, $f18 \r\n"
        "psubsh $f27, $f27, $f23 \r\n"
        "psubsh $f28, $f28, $f24 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f15 \r\n"
        "paddsh $f28, $f28, $f16 \r\n"
        "paddsh $f27, $f27, $f25 \r\n"
        "paddsh $f28, $f28, $f26 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f16, $f27, $f28 \r\n"
        "sdc1 $f16, 0(%[dst7]) \r\n"
        ::[dst0]"r"(dst), [dst1]"r"(dst+dstStride),
          [dst2]"r"(dst+2*dstStride), [dst3]"r"(dst+3*dstStride),
          [dst4]"r"(dst+4*dstStride), [dst5]"r"(dst+5*dstStride),
          [dst6]"r"(dst+6*dstStride), [dst7]"r"(dst+7*dstStride),
          [srcB]"r"(src-2*srcStride), [srcA]"r"(src-srcStride),
          [src0]"r"(src), [src1]"r"(src+srcStride),
          [src2]"r"(src+2*srcStride), [src3]"r"(src+3*srcStride),
          [src4]"r"(src+4*srcStride), [src5]"r"(src+5*srcStride),
          [src6]"r"(src+6*srcStride), [src7]"r"(src+7*srcStride),
          [src8]"r"(src+8*srcStride), [src9]"r"(src+9*srcStride),
          [src10]"r"(src+10*srcStride), [ff_pw_4]"f"(ff_pw_4),
          [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16)
        : "$f0","$f1","$f2","$f3","$f4","$f5","$f6","$f7","$f8","$f9","$f10",
          "$f11","$f12","$f13","$f14","$f15","$f16","$f17","$f18","$f19",
          "$f20","$f21","$f22","$f23","$f24","$f25","$f26","$f27","$f28"
    );
}

static void put_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    put_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
    put_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
    src += 8*srcStride;
    dst += 8*dstStride;
    put_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
    put_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
}

static void avg_h264_qpel4_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    __asm__ volatile (
        "xor $f0, $f0, $f0 \r\n"
        "gslwlc1 $f2, 3(%[srcB]) \r\n"
        "gslwrc1 $f2, 0(%[srcB]) \r\n"
        "gslwlc1 $f4, 3(%[srcA]) \r\n"
        "gslwrc1 $f4, 0(%[srcA]) \r\n"
        "gslwlc1 $f6, 3(%[src0]) \r\n"
        "gslwrc1 $f6, 0(%[src0]) \r\n"
        "gslwlc1 $f8, 3(%[src1]) \r\n"
        "gslwrc1 $f8, 0(%[src1]) \r\n"
        "gslwlc1 $f10, 3(%[src2]) \r\n"
        "gslwrc1 $f10, 0(%[src2]) \r\n"
        "gslwlc1 $f12, 3(%[src3]) \r\n"
        "gslwrc1 $f12, 0(%[src3]) \r\n"
        "gslwlc1 $f14, 3(%[src4]) \r\n"
        "gslwrc1 $f14, 0(%[src4]) \r\n"
        "gslwlc1 $f16, 3(%[src5]) \r\n"
        "gslwrc1 $f16, 0(%[src5]) \r\n"
        "gslwlc1 $f18, 3(%[src6]) \r\n"
        "gslwrc1 $f18, 0(%[src6]) \r\n"
        "punpcklbh $f2, $f2, $f0 \r\n"
        "punpcklbh $f4, $f4, $f0 \r\n"
        "punpcklbh $f6, $f6, $f0 \r\n"
        "punpcklbh $f8, $f8, $f0 \r\n"
        "punpcklbh $f10, $f10, $f0 \r\n"
        "punpcklbh $f12, $f12, $f0 \r\n"
        "punpcklbh $f14, $f14, $f0 \r\n"
        "punpcklbh $f16, $f16, $f0 \r\n"
        "punpcklbh $f18, $f18, $f0 \r\n"
        "paddsh $f20, $f6, $f8 \r\n"
        "pmullh $f20, $f20, %[ff_pw_20] \r\n"
        "paddsh $f22, $f4, $f10 \r\n"
        "pmullh $f22, $f22, %[ff_pw_5] \r\n"
        "psubsh $f24, $f20, $f22 \r\n"
        "paddsh $f24, $f24, $f2 \r\n"
        "paddsh $f24, $f24, $f12 \r\n"
        "paddsh $f20, $f8, $f10 \r\n"
        "pmullh $f20, $f20, %[ff_pw_20] \r\n"
        "paddsh $f22, $f6, $f12 \r\n"
        "pmullh $f22, $f22, %[ff_pw_5] \r\n"
        "psubsh $f26, $f20, $f22 \r\n"
        "paddsh $f26, $f26, $f4 \r\n"
        "paddsh $f26, $f26, $f14 \r\n"
        "paddsh $f20, $f10, $f12 \r\n"
        "pmullh $f20, $f20, %[ff_pw_20] \r\n"
        "paddsh $f22, $f8, $f14 \r\n"
        "pmullh $f22, $f22, %[ff_pw_5] \r\n"
        "psubsh $f28, $f20, $f22 \r\n"
        "paddsh $f28, $f28, $f6 \r\n"
        "paddsh $f28, $f28, $f16 \r\n"
        "paddsh $f20, $f12, $f14 \r\n"
        "pmullh $f20, $f20, %[ff_pw_20] \r\n"
        "paddsh $f22, $f10, $f16 \r\n"
        "pmullh $f22, $f22, %[ff_pw_5] \r\n"
        "psubsh $f30, $f20, $f22 \r\n"
        "paddsh $f30, $f30, $f8 \r\n"
        "paddsh $f30, $f30, $f18 \r\n"
        "paddsh $f24, $f24, %[ff_pw_16] \r\n"
        "paddsh $f26, $f26, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "paddsh $f30, $f30, %[ff_pw_16] \r\n"
        "psrah $f24, $f24, %[ff_pw_5] \r\n"
        "psrah $f26, $f26, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "psrah $f30, $f30, %[ff_pw_5] \r\n"
        "packushb $f24, $f24, $f0 \r\n"
        "packushb $f26, $f26, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "packushb $f30, $f30, $f0 \r\n"
        "lwc1 $f2, 0(%[dst0]) \r\n"
        "lwc1 $f4, 0(%[dst1]) \r\n"
        "lwc1 $f6, 0(%[dst2]) \r\n"
        "lwc1 $f8, 0(%[dst3]) \r\n"
        "pavgb $f24, $f2, $f24 \r\n"
        "pavgb $f26, $f4, $f26 \r\n"
        "pavgb $f28, $f6, $f28 \r\n"
        "pavgb $f30, $f8, $f30 \r\n"
        "swc1 $f24, 0(%[dst0]) \r\n"
        "swc1 $f26, 0(%[dst1]) \r\n"
        "swc1 $f28, 0(%[dst2]) \r\n"
        "swc1 $f30, 0(%[dst3]) \r\n"
        ::[dst0]"r"(dst), [dst1]"r"(dst+dstStride),
          [dst2]"r"(dst+2*dstStride), [dst3]"r"(dst+3*dstStride),
          [srcB]"r"(src-2*srcStride), [srcA]"r"(src-srcStride),
          [src0]"r"(src), [src1]"r"(src+srcStride),
          [src2]"r"(src+2*srcStride), [src3]"r"(src+3*srcStride),
          [src4]"r"(src+4*srcStride), [src5]"r"(src+5*srcStride),
          [src6]"r"(src+6*srcStride), [ff_pw_20]"f"(ff_pw_20),
          [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16)
        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18",
          "$f20","$f22","$f24","$f26","$f28","$f30"
    );
}

static void avg_h264_qpel8_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    __asm__ volatile (
        "xor $f0, $f0, $f0 \r\n"
        "gsldlc1 $f2, 7(%[srcB]) \r\n"
        "gsldrc1 $f2, 0(%[srcB]) \r\n"
        "gsldlc1 $f4, 7(%[srcA]) \r\n"
        "gsldrc1 $f4, 0(%[srcA]) \r\n"
        "gsldlc1 $f6, 7(%[src0]) \r\n"
        "gsldrc1 $f6, 0(%[src0]) \r\n"
        "gsldlc1 $f8, 7(%[src1]) \r\n"
        "gsldrc1 $f8, 0(%[src1]) \r\n"
        "gsldlc1 $f10, 7(%[src2]) \r\n"
        "gsldrc1 $f10, 0(%[src2]) \r\n"
        "gsldlc1 $f12, 7(%[src3]) \r\n"
        "gsldrc1 $f12, 0(%[src3]) \r\n"
        "gsldlc1 $f14, 7(%[src4]) \r\n"
        "gsldrc1 $f14, 0(%[src4]) \r\n"
        "gsldlc1 $f16, 7(%[src5]) \r\n"
        "gsldrc1 $f16, 0(%[src5]) \r\n"
        "gsldlc1 $f18, 7(%[src6]) \r\n"
        "gsldrc1 $f18, 0(%[src6]) \r\n"
        "gsldlc1 $f20, 7(%[src7]) \r\n"
        "gsldrc1 $f20, 0(%[src7]) \r\n"
        "gsldlc1 $f22, 7(%[src8]) \r\n"
        "gsldrc1 $f22, 0(%[src8]) \r\n"
        "gsldlc1 $f24, 7(%[src9]) \r\n"
        "gsldrc1 $f24, 0(%[src9]) \r\n"
        "gsldlc1 $f26, 7(%[src10]) \r\n"
        "gsldrc1 $f26, 0(%[src10]) \r\n"
        "punpcklbh $f1, $f2, $f0 \r\n"
        "punpckhbh $f2, $f2, $f0 \r\n"
        "punpcklbh $f3, $f4, $f0 \r\n"
        "punpckhbh $f4, $f4, $f0 \r\n"
        "punpcklbh $f5, $f6, $f0 \r\n"
        "punpckhbh $f6, $f6, $f0 \r\n"
        "punpcklbh $f7, $f8, $f0 \r\n"
        "punpckhbh $f8, $f8, $f0 \r\n"
        "punpcklbh $f9, $f10, $f0 \r\n"
        "punpckhbh $f10, $f10, $f0 \r\n"
        "punpcklbh $f11, $f12, $f0 \r\n"
        "punpckhbh $f12, $f12, $f0 \r\n"
        "punpcklbh $f13, $f14, $f0 \r\n"
        "punpckhbh $f14, $f14, $f0 \r\n"
        "punpcklbh $f15, $f16, $f0 \r\n"
        "punpckhbh $f16, $f16, $f0 \r\n"
        "punpcklbh $f17, $f18, $f0 \r\n"
        "punpckhbh $f18, $f18, $f0 \r\n"
        "punpcklbh $f19, $f20, $f0 \r\n"
        "punpckhbh $f20, $f20, $f0 \r\n"
        "punpcklbh $f21, $f22, $f0 \r\n"
        "punpckhbh $f22, $f22, $f0 \r\n"
        "punpcklbh $f23, $f24, $f0 \r\n"
        "punpckhbh $f24, $f24, $f0 \r\n"
        "punpcklbh $f25, $f26, $f0 \r\n"
        "punpckhbh $f26, $f26, $f0 \r\n"
        "paddsh $f27, $f5, $f7 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f6, $f8 \r\n" //src0+src1
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f3 \r\n"
        "psubsh $f28, $f28, $f4 \r\n"
        "psubsh $f27, $f27, $f9 \r\n"
        "psubsh $f28, $f28, $f10 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f1 \r\n"
        "paddsh $f28, $f28, $f2 \r\n"
        "paddsh $f27, $f27, $f11 \r\n"
        "paddsh $f28, $f28, $f12 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f2, $f27, $f28 \r\n"
        "ldc1 $f28, 0(%[dst0]) \r\n"
        "pavgb $f2, $f2, $f28 \r\n"
        "sdc1 $f2, 0(%[dst0]) \r\n"
        "paddsh $f27, $f7, $f9 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f8, $f10 \r\n" //src1+src2
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f5 \r\n"
        "psubsh $f28, $f28, $f6 \r\n"
        "psubsh $f27, $f27, $f11 \r\n"
        "psubsh $f28, $f28, $f12 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f3 \r\n"
        "paddsh $f28, $f28, $f4 \r\n"
        "paddsh $f27, $f27, $f13 \r\n"
        "paddsh $f28, $f28, $f14 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f4, $f27, $f28 \r\n"
        "ldc1 $f28, 0(%[dst1]) \r\n"
        "pavgb $f4, $f4, $f28 \r\n"
        "sdc1 $f4, 0(%[dst1]) \r\n"
        "paddsh $f27, $f9, $f11 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f10, $f12 \r\n" //src2+src3
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f7 \r\n"
        "psubsh $f28, $f28, $f8 \r\n"
        "psubsh $f27, $f27, $f13 \r\n"
        "psubsh $f28, $f28, $f14 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f5 \r\n"
        "paddsh $f28, $f28, $f6 \r\n"
        "paddsh $f27, $f27, $f15 \r\n"
        "paddsh $f28, $f28, $f16 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f6, $f27, $f28 \r\n"
        "ldc1 $f28, 0(%[dst2]) \r\n"
        "pavgb $f6, $f6, $f28 \r\n"
        "sdc1 $f6, 0(%[dst2]) \r\n"
        "paddsh $f27, $f11, $f13 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f12, $f14 \r\n" //src3+src4
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f9 \r\n"
        "psubsh $f28, $f28, $f10 \r\n"
        "psubsh $f27, $f27, $f15 \r\n"
        "psubsh $f28, $f28, $f16 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f7 \r\n"
        "paddsh $f28, $f28, $f8 \r\n"
        "paddsh $f27, $f27, $f17 \r\n"
        "paddsh $f28, $f28, $f18 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f8, $f27, $f28 \r\n"
        "ldc1 $f28, 0(%[dst3]) \r\n"
        "pavgb $f8, $f8, $f28 \r\n"
        "sdc1 $f8, 0(%[dst3]) \r\n"
        "paddsh $f27, $f13, $f15 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f14, $f16 \r\n" //src4+src5
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f11 \r\n"
        "psubsh $f28, $f28, $f12 \r\n"
        "psubsh $f27, $f27, $f17 \r\n"
        "psubsh $f28, $f28, $f18 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f9 \r\n"
        "paddsh $f28, $f28, $f10 \r\n"
        "paddsh $f27, $f27, $f19 \r\n"
        "paddsh $f28, $f28, $f20 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f10, $f27, $f28 \r\n"
        "ldc1 $f28, 0(%[dst4]) \r\n"
        "pavgb $f10, $f10, $f28 \r\n"
        "sdc1 $f10, 0(%[dst4]) \r\n"
        "paddsh $f27, $f15, $f17 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f16, $f18 \r\n" //src5+src6
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f13 \r\n"
        "psubsh $f28, $f28, $f14 \r\n"
        "psubsh $f27, $f27, $f19 \r\n"
        "psubsh $f28, $f28, $f20 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f11 \r\n"
        "paddsh $f28, $f28, $f12 \r\n"
        "paddsh $f27, $f27, $f21 \r\n"
        "paddsh $f28, $f28, $f22 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f12, $f27, $f28 \r\n"
        "ldc1 $f28, 0(%[dst5]) \r\n"
        "pavgb $f12, $f12, $f28 \r\n"
        "sdc1 $f12, 0(%[dst5]) \r\n"
        "paddsh $f27, $f17, $f19 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f18, $f20 \r\n" //src6+src7
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f15 \r\n"
        "psubsh $f28, $f28, $f16 \r\n"
        "psubsh $f27, $f27, $f21 \r\n"
        "psubsh $f28, $f28, $f22 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f13 \r\n"
        "paddsh $f28, $f28, $f14 \r\n"
        "paddsh $f27, $f27, $f23 \r\n"
        "paddsh $f28, $f28, $f24 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f14, $f27, $f28 \r\n"
        "ldc1 $f28, 0(%[dst6]) \r\n"
        "pavgb $f14, $f14, $f28 \r\n"
        "sdc1 $f14, 0(%[dst6]) \r\n"
        "paddsh $f27, $f19, $f21 \r\n"
        "pmullh $f27, $f27, %[ff_pw_4] \r\n"
        "paddsh $f28, $f20, $f22 \r\n" //src7+src8
        "pmullh $f28, $f28, %[ff_pw_4] \r\n"
        "psubsh $f27, $f27, $f17 \r\n"
        "psubsh $f28, $f28, $f18 \r\n"
        "psubsh $f27, $f27, $f23 \r\n"
        "psubsh $f28, $f28, $f24 \r\n"
        "pmullh $f27, $f27, %[ff_pw_5] \r\n"
        "pmullh $f28, $f28, %[ff_pw_5] \r\n"
        "paddsh $f27, $f27, $f15 \r\n"
        "paddsh $f28, $f28, $f16 \r\n"
        "paddsh $f27, $f27, $f25 \r\n"
        "paddsh $f28, $f28, $f26 \r\n"
        "paddsh $f27, $f27, %[ff_pw_16] \r\n"
        "paddsh $f28, $f28, %[ff_pw_16] \r\n"
        "psrah $f27, $f27, %[ff_pw_5] \r\n"
        "psrah $f28, $f28, %[ff_pw_5] \r\n"
        "packushb $f27, $f27, $f0 \r\n"
        "packushb $f28, $f28, $f0 \r\n"
        "punpcklwd $f16, $f27, $f28 \r\n"
        "ldc1 $f28, 0(%[dst7]) \r\n"
        "pavgb $f16, $f16, $f28 \r\n"
        "sdc1 $f16, 0(%[dst7]) \r\n"
        ::[dst0]"r"(dst), [dst1]"r"(dst+dstStride),
          [dst2]"r"(dst+2*dstStride), [dst3]"r"(dst+3*dstStride),
          [dst4]"r"(dst+4*dstStride), [dst5]"r"(dst+5*dstStride),
          [dst6]"r"(dst+6*dstStride), [dst7]"r"(dst+7*dstStride),
          [srcB]"r"(src-2*srcStride), [srcA]"r"(src-srcStride),
          [src0]"r"(src), [src1]"r"(src+srcStride),
          [src2]"r"(src+2*srcStride), [src3]"r"(src+3*srcStride),
          [src4]"r"(src+4*srcStride), [src5]"r"(src+5*srcStride),
          [src6]"r"(src+6*srcStride), [src7]"r"(src+7*srcStride),
          [src8]"r"(src+8*srcStride), [src9]"r"(src+9*srcStride),
          [src10]"r"(src+10*srcStride), [ff_pw_4]"f"(ff_pw_4),
          [ff_pw_5]"f"(ff_pw_5), [ff_pw_16]"f"(ff_pw_16)
        : "$f0","$f1","$f2","$f3","$f4","$f5","$f6","$f7","$f8","$f9","$f10",
          "$f11","$f12","$f13","$f14","$f15","$f16","$f17","$f18","$f19",
          "$f20","$f21","$f22","$f23","$f24","$f25","$f26","$f27","$f28"
    );
}

static void avg_h264_qpel16_v_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    avg_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
    avg_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
    src += 8*srcStride;
    dst += 8*dstStride;
    avg_h264_qpel8_v_lowpass_mmi(dst, src, dstStride, srcStride);
    avg_h264_qpel8_v_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
}

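/*
 * The hv (centre) kernels run two passes: a horizontal 6-tap pass whose
 * unclipped, unshifted 16-bit results are stored to a temporary buffer
 * (rows of 4 or 8 int16_t, starting two rows above the block), then a
 * scalar vertical pass over that buffer in which op2_put/op2_avg apply the
 * final (x + 512) >> 10 rounding and clip.
 */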
static void put_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    int i;
    int16_t _tmp[36];
    int16_t *tmp = _tmp;
    src -= 2*srcStride;

    __asm__ volatile (
        "xor $f0, $f0, $f0 \r\n"
        "dli $8, 9 \r\n"
        "1: \r\n"
        "gslwlc1 $f2, 1(%[src]) \r\n"
        "gslwrc1 $f2, -2(%[src]) \r\n"
        "gslwlc1 $f4, 2(%[src]) \r\n"
        "gslwrc1 $f4, -1(%[src]) \r\n"
        "gslwlc1 $f6, 3(%[src]) \r\n"
        "gslwrc1 $f6, 0(%[src]) \r\n"
        "gslwlc1 $f8, 4(%[src]) \r\n"
        "gslwrc1 $f8, 1(%[src]) \r\n"
        "gslwlc1 $f10, 5(%[src]) \r\n"
        "gslwrc1 $f10, 2(%[src]) \r\n"
        "gslwlc1 $f12, 6(%[src]) \r\n"
        "gslwrc1 $f12, 3(%[src]) \r\n"
        "punpcklbh $f2, $f2, $f0 \r\n"
        "punpcklbh $f4, $f4, $f0 \r\n"
        "punpcklbh $f6, $f6, $f0 \r\n"
        "punpcklbh $f8, $f8, $f0 \r\n"
        "punpcklbh $f10, $f10, $f0 \r\n"
        "punpcklbh $f12, $f12, $f0 \r\n"
        "paddsh $f14, $f6, $f8 \r\n"
        "paddsh $f16, $f4, $f10 \r\n"
        "paddsh $f18, $f2, $f12 \r\n"
        "pmullh $f14, $f14, %[ff_pw_20] \r\n"
        "pmullh $f16, $f16, %[ff_pw_5] \r\n"
        "psubsh $f14, $f14, $f16 \r\n"
        "paddsh $f18, $f14, $f18 \r\n"
        "sdc1 $f18, 0(%[tmp]) \r\n"
        "dadd %[tmp], %[tmp], %[tmpStride] \r\n"
        "dadd %[src], %[src], %[srcStride] \r\n"
        "daddi $8, $8, -1 \r\n"
        "bnez $8, 1b \r\n"
        : [tmp]"+&r"(tmp),[src]"+&r"(src)
        : [tmpStride]"r"(8),[srcStride]"r"(srcStride),
          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5)
        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18"
    );

    /* rewind to the row of the block's first line (two 4-sample rows in) */
    tmp -= 28;

    for (i = 0; i < 4; i++) {
        const int16_t tmpB= tmp[-8];
        const int16_t tmpA= tmp[-4];
        const int16_t tmp0= tmp[ 0];
        const int16_t tmp1= tmp[ 4];
        const int16_t tmp2= tmp[ 8];
        const int16_t tmp3= tmp[12];
        const int16_t tmp4= tmp[16];
        const int16_t tmp5= tmp[20];
        const int16_t tmp6= tmp[24];
        op2_put(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
        op2_put(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
        op2_put(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
        op2_put(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
        dst++;
        tmp++;
    }
}

static void put_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    int i;
    int16_t _tmp[104];
    int16_t *tmp = _tmp;
    src -= 2*srcStride;

    __asm__ volatile (
        "xor $f0, $f0, $f0 \r\n"
        "dli $8, 13 \r\n"
        "1: \r\n"
        "gsldlc1 $f2, 5(%[src]) \r\n"
        "gsldrc1 $f2, -2(%[src]) \r\n"
        "gsldlc1 $f4, 6(%[src]) \r\n"
        "gsldrc1 $f4, -1(%[src]) \r\n"
        "gsldlc1 $f6, 7(%[src]) \r\n"
        "gsldrc1 $f6, 0(%[src]) \r\n"
        "gsldlc1 $f8, 8(%[src]) \r\n"
        "gsldrc1 $f8, 1(%[src]) \r\n"
        "gsldlc1 $f10, 9(%[src]) \r\n"
        "gsldrc1 $f10, 2(%[src]) \r\n"
        "gsldlc1 $f12, 10(%[src]) \r\n"
        "gsldrc1 $f12, 3(%[src]) \r\n"
        "punpcklbh $f1, $f2, $f0 \r\n"
        "punpcklbh $f3, $f4, $f0 \r\n"
        "punpcklbh $f5, $f6, $f0 \r\n"
        "punpcklbh $f7, $f8, $f0 \r\n"
        "punpcklbh $f9, $f10, $f0 \r\n"
        "punpcklbh $f11, $f12, $f0 \r\n"
        "punpckhbh $f2, $f2, $f0 \r\n"
        "punpckhbh $f4, $f4, $f0 \r\n"
        "punpckhbh $f6, $f6, $f0 \r\n"
        "punpckhbh $f8, $f8, $f0 \r\n"
        "punpckhbh $f10, $f10, $f0 \r\n"
        "punpckhbh $f12, $f12, $f0 \r\n"
        "paddsh $f13, $f5, $f7 \r\n"
        "paddsh $f15, $f3, $f9 \r\n"
        "paddsh $f17, $f1, $f11 \r\n"
        "pmullh $f13, $f13, %[ff_pw_20] \r\n"
        "pmullh $f15, $f15, %[ff_pw_5] \r\n"
        "psubsh $f13, $f13, $f15 \r\n"
        "paddsh $f17, $f13, $f17 \r\n"
        "paddsh $f14, $f6, $f8 \r\n"
        "paddsh $f16, $f4, $f10 \r\n"
        "paddsh $f18, $f2, $f12 \r\n"
        "pmullh $f14, $f14, %[ff_pw_20] \r\n"
        "pmullh $f16, $f16, %[ff_pw_5] \r\n"
        "psubsh $f14, $f14, $f16 \r\n"
        "paddsh $f18, $f14, $f18 \r\n"
        "sdc1 $f17, 0(%[tmp]) \r\n"
        "sdc1 $f18, 8(%[tmp]) \r\n"
        "dadd %[tmp], %[tmp], %[tmpStride] \r\n"
        "dadd %[src], %[src], %[srcStride] \r\n"
        "daddi $8, $8, -1 \r\n"
        "bnez $8, 1b \r\n"
        : [tmp]"+&r"(tmp),[src]"+&r"(src)
        : [tmpStride]"r"(16),[srcStride]"r"(srcStride),
          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5)
        : "$8","$f0","$f1","$f2","$f3","$f4","$f5","$f6","$f7","$f8","$f9",
          "$f10","$f11","$f12","$f13","$f14","$f15","$f16","$f17","$f18"
    );

    /* rewind to the row of the block's first line (two 8-sample rows in) */
    tmp -= 88;

    for (i = 0; i < 8; i++) {
        const int tmpB= tmp[-16];
        const int tmpA= tmp[ -8];
        const int tmp0= tmp[  0];
        const int tmp1= tmp[  8];
        const int tmp2= tmp[ 16];
        const int tmp3= tmp[ 24];
        const int tmp4= tmp[ 32];
        const int tmp5= tmp[ 40];
        const int tmp6= tmp[ 48];
        const int tmp7= tmp[ 56];
        const int tmp8= tmp[ 64];
        const int tmp9= tmp[ 72];
        const int tmp10=tmp[ 80];
        op2_put(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
        op2_put(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
        op2_put(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
        op2_put(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
        op2_put(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));
        op2_put(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));
        op2_put(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));
        op2_put(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));
        dst++;
        tmp++;
    }
}

static void put_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    put_h264_qpel8_hv_lowpass_mmi(dst, src, dstStride, srcStride);
    put_h264_qpel8_hv_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
    src += 8*srcStride;
    dst += 8*dstStride;
    put_h264_qpel8_hv_lowpass_mmi(dst, src, dstStride, srcStride);
    put_h264_qpel8_hv_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
}

static void avg_h264_qpel4_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    int i;
    int16_t _tmp[36];
    int16_t *tmp = _tmp;
    src -= 2*srcStride;

    __asm__ volatile (
        "xor $f0, $f0, $f0 \r\n"
        "dli $8, 9 \r\n"
        "1: \r\n"
        "gslwlc1 $f2, 1(%[src]) \r\n"
        "gslwrc1 $f2, -2(%[src]) \r\n"
        "gslwlc1 $f4, 2(%[src]) \r\n"
        "gslwrc1 $f4, -1(%[src]) \r\n"
        "gslwlc1 $f6, 3(%[src]) \r\n"
        "gslwrc1 $f6, 0(%[src]) \r\n"
        "gslwlc1 $f8, 4(%[src]) \r\n"
        "gslwrc1 $f8, 1(%[src]) \r\n"
        "gslwlc1 $f10, 5(%[src]) \r\n"
        "gslwrc1 $f10, 2(%[src]) \r\n"
        "gslwlc1 $f12, 6(%[src]) \r\n"
        "gslwrc1 $f12, 3(%[src]) \r\n"
        "punpcklbh $f2, $f2, $f0 \r\n"
        "punpcklbh $f4, $f4, $f0 \r\n"
        "punpcklbh $f6, $f6, $f0 \r\n"
        "punpcklbh $f8, $f8, $f0 \r\n"
        "punpcklbh $f10, $f10, $f0 \r\n"
        "punpcklbh $f12, $f12, $f0 \r\n"
        "paddsh $f14, $f6, $f8 \r\n"
        "paddsh $f16, $f4, $f10 \r\n"
        "paddsh $f18, $f2, $f12 \r\n"
        "pmullh $f14, $f14, %[ff_pw_20] \r\n"
        "pmullh $f16, $f16, %[ff_pw_5] \r\n"
        "psubsh $f14, $f14, $f16 \r\n"
        "paddsh $f18, $f14, $f18 \r\n"
        "sdc1 $f18, 0(%[tmp]) \r\n"
        "dadd %[tmp], %[tmp], %[tmpStride] \r\n"
        "dadd %[src], %[src], %[srcStride] \r\n"
        "daddi $8, $8, -1 \r\n"
        "bnez $8, 1b \r\n"
        : [tmp]"+&r"(tmp),[src]"+&r"(src)
        : [tmpStride]"r"(8),[srcStride]"r"(srcStride),
          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5)
        : "$8","$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18"
    );

    /* rewind to the row of the block's first line (two 4-sample rows in) */
    tmp -= 28;

    for (i = 0; i < 4; i++) {
        const int16_t tmpB= tmp[-8];
        const int16_t tmpA= tmp[-4];
        const int16_t tmp0= tmp[ 0];
        const int16_t tmp1= tmp[ 4];
        const int16_t tmp2= tmp[ 8];
        const int16_t tmp3= tmp[12];
        const int16_t tmp4= tmp[16];
        const int16_t tmp5= tmp[20];
        const int16_t tmp6= tmp[24];
        op2_avg(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
        op2_avg(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
        op2_avg(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
        op2_avg(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
        dst++;
        tmp++;
    }
}

static void avg_h264_qpel8_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    int i;
    int16_t _tmp[104];
    int16_t *tmp = _tmp;
    src -= 2*srcStride;

    __asm__ volatile (
        "xor $f0, $f0, $f0 \r\n"
        "dli $8, 13 \r\n"
        "1: \r\n"
        "gsldlc1 $f2, 5(%[src]) \r\n"
        "gsldrc1 $f2, -2(%[src]) \r\n"
        "gsldlc1 $f4, 6(%[src]) \r\n"
        "gsldrc1 $f4, -1(%[src]) \r\n"
        "gsldlc1 $f6, 7(%[src]) \r\n"
        "gsldrc1 $f6, 0(%[src]) \r\n"
        "gsldlc1 $f8, 8(%[src]) \r\n"
        "gsldrc1 $f8, 1(%[src]) \r\n"
        "gsldlc1 $f10, 9(%[src]) \r\n"
        "gsldrc1 $f10, 2(%[src]) \r\n"
        "gsldlc1 $f12, 10(%[src]) \r\n"
        "gsldrc1 $f12, 3(%[src]) \r\n"
        "punpcklbh $f1, $f2, $f0 \r\n"
        "punpcklbh $f3, $f4, $f0 \r\n"
        "punpcklbh $f5, $f6, $f0 \r\n"
        "punpcklbh $f7, $f8, $f0 \r\n"
        "punpcklbh $f9, $f10, $f0 \r\n"
        "punpcklbh $f11, $f12, $f0 \r\n"
        "punpckhbh $f2, $f2, $f0 \r\n"
        "punpckhbh $f4, $f4, $f0 \r\n"
        "punpckhbh $f6, $f6, $f0 \r\n"
        "punpckhbh $f8, $f8, $f0 \r\n"
        "punpckhbh $f10, $f10, $f0 \r\n"
        "punpckhbh $f12, $f12, $f0 \r\n"
        "paddsh $f13, $f5, $f7 \r\n"
        "paddsh $f15, $f3, $f9 \r\n"
        "paddsh $f17, $f1, $f11 \r\n"
        "pmullh $f13, $f13, %[ff_pw_20] \r\n"
        "pmullh $f15, $f15, %[ff_pw_5] \r\n"
        "psubsh $f13, $f13, $f15 \r\n"
        "paddsh $f17, $f13, $f17 \r\n"
        "paddsh $f14, $f6, $f8 \r\n"
        "paddsh $f16, $f4, $f10 \r\n"
        "paddsh $f18, $f2, $f12 \r\n"
        "pmullh $f14, $f14, %[ff_pw_20] \r\n"
        "pmullh $f16, $f16, %[ff_pw_5] \r\n"
        "psubsh $f14, $f14, $f16 \r\n"
        "paddsh $f18, $f14, $f18 \r\n"
        "sdc1 $f17, 0(%[tmp]) \r\n"
        "sdc1 $f18, 8(%[tmp]) \r\n"
        "dadd %[tmp], %[tmp], %[tmpStride] \r\n"
        "dadd %[src], %[src], %[srcStride] \r\n"
        "daddi $8, $8, -1 \r\n"
        "bnez $8, 1b \r\n"
        : [tmp]"+&r"(tmp),[src]"+&r"(src)
        : [tmpStride]"r"(16),[srcStride]"r"(srcStride),
          [ff_pw_20]"f"(ff_pw_20),[ff_pw_5]"f"(ff_pw_5)
        : "$8","$f0","$f1","$f2","$f3","$f4","$f5","$f6","$f7","$f8","$f9",
          "$f10","$f11","$f12","$f13","$f14","$f15","$f16","$f17","$f18"
    );

    /* rewind to the row of the block's first line (two 8-sample rows in) */
    tmp -= 88;

    for (i = 0; i < 8; i++) {
        const int tmpB= tmp[-16];
        const int tmpA= tmp[ -8];
        const int tmp0= tmp[  0];
        const int tmp1= tmp[  8];
        const int tmp2= tmp[ 16];
        const int tmp3= tmp[ 24];
        const int tmp4= tmp[ 32];
        const int tmp5= tmp[ 40];
        const int tmp6= tmp[ 48];
        const int tmp7= tmp[ 56];
        const int tmp8= tmp[ 64];
        const int tmp9= tmp[ 72];
        const int tmp10=tmp[ 80];
        op2_avg(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));
        op2_avg(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));
        op2_avg(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));
        op2_avg(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));
        op2_avg(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));
        op2_avg(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));
        op2_avg(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));
        op2_avg(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));
        dst++;
        tmp++;
    }
}

static void avg_h264_qpel16_hv_lowpass_mmi(uint8_t *dst, const uint8_t *src,
        int dstStride, int srcStride)
{
    avg_h264_qpel8_hv_lowpass_mmi(dst, src, dstStride, srcStride);
    avg_h264_qpel8_hv_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
    src += 8*srcStride;
    dst += 8*dstStride;
    avg_h264_qpel8_hv_lowpass_mmi(dst, src, dstStride, srcStride);
    avg_h264_qpel8_hv_lowpass_mmi(dst+8, src+8, dstStride, srcStride);
}

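/*
 * ff_{put,avg}_h264_qpelN_mcXY_mmi implement the 16 quarter-pel positions:
 * X is the horizontal and Y the vertical quarter-pel offset. mc20 is the
 * horizontal half-pel sample, mc02 the vertical one, mc22 the centre, and
 * the remaining positions are built by averaging two neighbouring planes
 * with the *_l2 helpers above.
 */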
//DEF_H264_MC_MMI(put_, 4)
void ff_put_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    put_pixels4_mmi(dst, src, stride, 4);
}

void ff_put_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t half[16];
    put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride);
    put_pixels4_l2_mmi(dst, src, half, stride, stride, 4, 4);
}

void ff_put_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    put_h264_qpel4_h_lowpass_mmi(dst, src, stride, stride);
}

1673 void ff_put_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
1677 put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride);
1678 put_pixels4_l2_mmi(dst, src+1, half, stride, stride, 4, 4);
1681 void ff_put_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
1685 uint8_t * const full_mid= full + 8;
1687 copy_block4_mmi(full, src - stride*2, 4, stride, 9);
1688 put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4);
1689 put_pixels4_l2_mmi(dst, full_mid, half, stride, 4, 4, 4);
1692 void ff_put_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
1696 uint8_t * const full_mid= full + 8;
1697 copy_block4_mmi(full, src - stride*2, 4, stride, 9);
1698 put_h264_qpel4_v_lowpass_mmi(dst, full_mid, stride, 4);
1701 void ff_put_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
1705 uint8_t * const full_mid= full + 8;
1707 copy_block4_mmi(full, src - stride*2, 4, stride, 9);
1708 put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4);
1709 put_pixels4_l2_mmi(dst, full_mid+4, half, stride, 4, 4, 4);
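
/* Diagonal positions (mc11/mc31/mc13/mc33) average a horizontal half-pel
 * plane with a vertical one; the source offsets (+1 column, +1 row) pick
 * the pair of planes centred on the requested quarter-pel point. */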
void ff_put_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t halfH[16];
    uint8_t halfV[16];
    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
    copy_block4_mmi(full, src - stride*2, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
    put_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
}

void ff_put_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t halfH[16];
    uint8_t halfV[16];
    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
    copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
    put_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
}

void ff_put_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t halfH[16];
    uint8_t halfV[16];
    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
    copy_block4_mmi(full, src - stride*2, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
    put_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
}

void ff_put_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t halfH[16];
    uint8_t halfV[16];
    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
    copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
    put_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
}

void ff_put_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    put_h264_qpel4_hv_lowpass_mmi(dst, src, stride, stride);
}
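
/* The positions next to the centre (mc21/mc23/mc12/mc32) average the 2-D
 * hv half-pel plane with the nearer horizontal or vertical half-pel
 * plane. */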
void ff_put_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t halfH[16];
    uint8_t halfHV[16];
    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
    put_pixels4_l2_mmi(dst, halfH, halfHV, stride, 4, 4, 4);
}

void ff_put_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t halfH[16];
    uint8_t halfHV[16];
    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
    put_pixels4_l2_mmi(dst, halfH, halfHV, stride, 4, 4, 4);
}

void ff_put_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t halfV[16];
    uint8_t halfHV[16];
    copy_block4_mmi(full, src - stride*2, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
    put_pixels4_l2_mmi(dst, halfV, halfHV, stride, 4, 4, 4);
}

void ff_put_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t halfV[16];
    uint8_t halfHV[16];
    copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
    put_pixels4_l2_mmi(dst, halfV, halfHV, stride, 4, 4, 4);
}

//DEF_H264_MC_MMI(avg_, 4)
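/* The avg_ set is identical to the put_ set above except that the final
 * write is a rounded average with the pixels already in dst (op_avg /
 * avg_pixels*), as used for inter bi-prediction. */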
void ff_avg_h264_qpel4_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    avg_pixels4_mmi(dst, src, stride, 4);
}

void ff_avg_h264_qpel4_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t half[16];
    put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride);
    avg_pixels4_l2_mmi(dst, src, half, stride, stride, 4, 4);
}

void ff_avg_h264_qpel4_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    avg_h264_qpel4_h_lowpass_mmi(dst, src, stride, stride);
}

void ff_avg_h264_qpel4_mc30_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t half[16];
    put_h264_qpel4_h_lowpass_mmi(half, src, 4, stride);
    avg_pixels4_l2_mmi(dst, src+1, half, stride, stride, 4, 4);
}

void ff_avg_h264_qpel4_mc01_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t half[16];
    copy_block4_mmi(full, src - stride*2, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4);
    avg_pixels4_l2_mmi(dst, full_mid, half, stride, 4, 4, 4);
}

void ff_avg_h264_qpel4_mc02_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    copy_block4_mmi(full, src - stride*2, 4, stride, 9);
    avg_h264_qpel4_v_lowpass_mmi(dst, full_mid, stride, 4);
}

void ff_avg_h264_qpel4_mc03_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t half[16];
    copy_block4_mmi(full, src - stride*2, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(half, full_mid, 4, 4);
    avg_pixels4_l2_mmi(dst, full_mid+4, half, stride, 4, 4, 4);
}

void ff_avg_h264_qpel4_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t halfH[16];
    uint8_t halfV[16];
    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
    copy_block4_mmi(full, src - stride*2, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
    avg_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
}

void ff_avg_h264_qpel4_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t halfH[16];
    uint8_t halfV[16];
    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
    copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
    avg_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
}

void ff_avg_h264_qpel4_mc13_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t halfH[16];
    uint8_t halfV[16];
    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
    copy_block4_mmi(full, src - stride*2, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
    avg_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
}

void ff_avg_h264_qpel4_mc33_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t halfH[16];
    uint8_t halfV[16];
    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
    copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
    avg_pixels4_l2_mmi(dst, halfH, halfV, stride, 4, 4, 4);
}

void ff_avg_h264_qpel4_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    avg_h264_qpel4_hv_lowpass_mmi(dst, src, stride, stride);
}

void ff_avg_h264_qpel4_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t halfH[16];
    uint8_t halfHV[16];
    put_h264_qpel4_h_lowpass_mmi(halfH, src, 4, stride);
    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
    avg_pixels4_l2_mmi(dst, halfH, halfHV, stride, 4, 4, 4);
}

void ff_avg_h264_qpel4_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t halfH[16];
    uint8_t halfHV[16];
    put_h264_qpel4_h_lowpass_mmi(halfH, src + stride, 4, stride);
    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
    avg_pixels4_l2_mmi(dst, halfH, halfHV, stride, 4, 4, 4);
}

void ff_avg_h264_qpel4_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t halfV[16];
    uint8_t halfHV[16];
    copy_block4_mmi(full, src - stride*2, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
    avg_pixels4_l2_mmi(dst, halfV, halfHV, stride, 4, 4, 4);
}

void ff_avg_h264_qpel4_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[36];
    uint8_t * const full_mid= full + 8;
    uint8_t halfV[16];
    uint8_t halfHV[16];
    copy_block4_mmi(full, src - stride*2 + 1, 4, stride, 9);
    put_h264_qpel4_v_lowpass_mmi(halfV, full_mid, 4, 4);
    put_h264_qpel4_hv_lowpass_mmi(halfHV, src, 4, stride);
    avg_pixels4_l2_mmi(dst, halfV, halfHV, stride, 4, 4, 4);
}

//DEF_H264_MC_MMI(put_, 8)
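/* The 8x8 variants repeat the 4x4 pattern with scaled buffers: 64-byte
 * half-pel planes and a 104-byte (8 x 13 rows) edge-extended copy feeding
 * the vertical filter. */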
void ff_put_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    put_pixels8_mmi(dst, src, stride, 8);
}

void ff_put_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t half[64];
    put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride);
    put_pixels8_l2_mmi(dst, src, half, stride, stride, 8, 8);
}

void ff_put_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    put_h264_qpel8_h_lowpass_mmi(dst, src, stride, stride);
}

void ff_put_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t half[64];
    put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride);
    put_pixels8_l2_mmi(dst, src+1, half, stride, stride, 8, 8);
}

void ff_put_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t half[64];
    copy_block8_mmi(full, src - stride*2, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8);
    put_pixels8_l2_mmi(dst, full_mid, half, stride, 8, 8, 8);
}

void ff_put_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    copy_block8_mmi(full, src - stride*2, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(dst, full_mid, stride, 8);
}

void ff_put_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t half[64];
    copy_block8_mmi(full, src - stride*2, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8);
    put_pixels8_l2_mmi(dst, full_mid+8, half, stride, 8, 8, 8);
}

void ff_put_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t halfH[64];
    uint8_t halfV[64];
    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
    copy_block8_mmi(full, src - stride*2, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
    put_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
}

void ff_put_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t halfH[64];
    uint8_t halfV[64];
    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
    copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
    put_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
}

void ff_put_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t halfH[64];
    uint8_t halfV[64];
    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
    copy_block8_mmi(full, src - stride*2, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
    put_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
}

void ff_put_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t halfH[64];
    uint8_t halfV[64];
    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
    copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
    put_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
}

void ff_put_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    put_h264_qpel8_hv_lowpass_mmi(dst, src, stride, stride);
}

void ff_put_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfHV[64];
    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
    put_pixels8_l2_mmi(dst, halfH, halfHV, stride, 8, 8, 8);
}

void ff_put_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfHV[64];
    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
    put_pixels8_l2_mmi(dst, halfH, halfHV, stride, 8, 8, 8);
}

void ff_put_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t halfV[64];
    uint8_t halfHV[64];
    copy_block8_mmi(full, src - stride*2, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
    put_pixels8_l2_mmi(dst, halfV, halfHV, stride, 8, 8, 8);
}

void ff_put_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t halfV[64];
    uint8_t halfHV[64];
    copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
    put_pixels8_l2_mmi(dst, halfV, halfHV, stride, 8, 8, 8);
}

//DEF_H264_MC_MMI(avg_, 8)
void ff_avg_h264_qpel8_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    avg_pixels8_mmi(dst, src, stride, 8);
}

void ff_avg_h264_qpel8_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t half[64];
    put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride);
    avg_pixels8_l2_mmi(dst, src, half, stride, stride, 8, 8);
}

void ff_avg_h264_qpel8_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    avg_h264_qpel8_h_lowpass_mmi(dst, src, stride, stride);
}

void ff_avg_h264_qpel8_mc30_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t half[64];
    put_h264_qpel8_h_lowpass_mmi(half, src, 8, stride);
    avg_pixels8_l2_mmi(dst, src+1, half, stride, stride, 8, 8);
}

void ff_avg_h264_qpel8_mc01_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t half[64];
    copy_block8_mmi(full, src - stride*2, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8);
    avg_pixels8_l2_mmi(dst, full_mid, half, stride, 8, 8, 8);
}

void ff_avg_h264_qpel8_mc02_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    copy_block8_mmi(full, src - stride*2, 8, stride, 13);
    avg_h264_qpel8_v_lowpass_mmi(dst, full_mid, stride, 8);
}

void ff_avg_h264_qpel8_mc03_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t half[64];
    copy_block8_mmi(full, src - stride*2, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(half, full_mid, 8, 8);
    avg_pixels8_l2_mmi(dst, full_mid+8, half, stride, 8, 8, 8);
}

void ff_avg_h264_qpel8_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t halfH[64];
    uint8_t halfV[64];
    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
    copy_block8_mmi(full, src - stride*2, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
    avg_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
}

void ff_avg_h264_qpel8_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t halfH[64];
    uint8_t halfV[64];
    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
    copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
    avg_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
}

void ff_avg_h264_qpel8_mc13_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t halfH[64];
    uint8_t halfV[64];
    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
    copy_block8_mmi(full, src - stride*2, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
    avg_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
}

void ff_avg_h264_qpel8_mc33_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t halfH[64];
    uint8_t halfV[64];
    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
    copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
    avg_pixels8_l2_mmi(dst, halfH, halfV, stride, 8, 8, 8);
}

void ff_avg_h264_qpel8_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    avg_h264_qpel8_hv_lowpass_mmi(dst, src, stride, stride);
}

void ff_avg_h264_qpel8_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfHV[64];
    put_h264_qpel8_h_lowpass_mmi(halfH, src, 8, stride);
    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
    avg_pixels8_l2_mmi(dst, halfH, halfHV, stride, 8, 8, 8);
}

void ff_avg_h264_qpel8_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t halfH[64];
    uint8_t halfHV[64];
    put_h264_qpel8_h_lowpass_mmi(halfH, src + stride, 8, stride);
    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
    avg_pixels8_l2_mmi(dst, halfH, halfHV, stride, 8, 8, 8);
}

void ff_avg_h264_qpel8_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t halfV[64];
    uint8_t halfHV[64];
    copy_block8_mmi(full, src - stride*2, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
    avg_pixels8_l2_mmi(dst, halfV, halfHV, stride, 8, 8, 8);
}

void ff_avg_h264_qpel8_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[104];
    uint8_t * const full_mid= full + 16;
    uint8_t halfV[64];
    uint8_t halfHV[64];
    copy_block8_mmi(full, src - stride*2 + 1, 8, stride, 13);
    put_h264_qpel8_v_lowpass_mmi(halfV, full_mid, 8, 8);
    put_h264_qpel8_hv_lowpass_mmi(halfHV, src, 8, stride);
    avg_pixels8_l2_mmi(dst, halfV, halfHV, stride, 8, 8, 8);
}

//DEF_H264_MC_MMI(put_, 16)
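/* The 16x16 variants scale the same pattern once more: 256-byte half-pel
 * planes and a 336-byte (16 x 21 rows) edge-extended copy. */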
void ff_put_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    put_pixels16_mmi(dst, src, stride, 16);
}

void ff_put_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t half[256];
    put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride);
    put_pixels16_l2_mmi(dst, src, half, stride, stride, 16, 16);
}

void ff_put_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    put_h264_qpel16_h_lowpass_mmi(dst, src, stride, stride);
}

void ff_put_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t half[256];
    put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride);
    put_pixels16_l2_mmi(dst, src+1, half, stride, stride, 16, 16);
}

void ff_put_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t half[256];
    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16);
    put_pixels16_l2_mmi(dst, full_mid, half, stride, 16, 16, 16);
}

void ff_put_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(dst, full_mid, stride, 16);
}

void ff_put_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t half[256];
    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16);
    put_pixels16_l2_mmi(dst, full_mid+16, half, stride, 16, 16, 16);
}

void ff_put_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t halfH[256];
    uint8_t halfV[256];
    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
    put_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
}

void ff_put_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t halfH[256];
    uint8_t halfV[256];
    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
    copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
    put_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
}

void ff_put_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t halfH[256];
    uint8_t halfV[256];
    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
    put_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
}

void ff_put_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t halfH[256];
    uint8_t halfV[256];
    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
    copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
    put_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
}

void ff_put_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    put_h264_qpel16_hv_lowpass_mmi(dst, src, stride, stride);
}

void ff_put_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t halfH[256];
    uint8_t halfHV[256];
    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
    put_pixels16_l2_mmi(dst, halfH, halfHV, stride, 16, 16, 16);
}

void ff_put_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t halfH[256];
    uint8_t halfHV[256];
    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
    put_pixels16_l2_mmi(dst, halfH, halfHV, stride, 16, 16, 16);
}

void ff_put_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t halfV[256];
    uint8_t halfHV[256];
    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
    put_pixels16_l2_mmi(dst, halfV, halfHV, stride, 16, 16, 16);
}

void ff_put_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t halfV[256];
    uint8_t halfHV[256];
    copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
    put_pixels16_l2_mmi(dst, halfV, halfHV, stride, 16, 16, 16);
}

//DEF_H264_MC_MMI(avg_, 16)
void ff_avg_h264_qpel16_mc00_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    avg_pixels16_mmi(dst, src, stride, 16);
}

void ff_avg_h264_qpel16_mc10_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t half[256];
    put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride);
    avg_pixels16_l2_mmi(dst, src, half, stride, stride, 16, 16);
}

void ff_avg_h264_qpel16_mc20_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    avg_h264_qpel16_h_lowpass_mmi(dst, src, stride, stride);
}

void ff_avg_h264_qpel16_mc30_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t half[256];
    put_h264_qpel16_h_lowpass_mmi(half, src, 16, stride);
    avg_pixels16_l2_mmi(dst, src+1, half, stride, stride, 16, 16);
}

void ff_avg_h264_qpel16_mc01_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t half[256];
    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16);
    avg_pixels16_l2_mmi(dst, full_mid, half, stride, 16, 16, 16);
}

void ff_avg_h264_qpel16_mc02_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
    avg_h264_qpel16_v_lowpass_mmi(dst, full_mid, stride, 16);
}

void ff_avg_h264_qpel16_mc03_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t half[256];
    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(half, full_mid, 16, 16);
    avg_pixels16_l2_mmi(dst, full_mid+16, half, stride, 16, 16, 16);
}

void ff_avg_h264_qpel16_mc11_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t halfH[256];
    uint8_t halfV[256];
    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
    avg_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
}

void ff_avg_h264_qpel16_mc31_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t halfH[256];
    uint8_t halfV[256];
    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
    copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
    avg_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
}

void ff_avg_h264_qpel16_mc13_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t halfH[256];
    uint8_t halfV[256];
    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
    avg_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
}

void ff_avg_h264_qpel16_mc33_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t halfH[256];
    uint8_t halfV[256];
    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
    copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
    avg_pixels16_l2_mmi(dst, halfH, halfV, stride, 16, 16, 16);
}

void ff_avg_h264_qpel16_mc22_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    avg_h264_qpel16_hv_lowpass_mmi(dst, src, stride, stride);
}

void ff_avg_h264_qpel16_mc21_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t halfH[256];
    uint8_t halfHV[256];
    put_h264_qpel16_h_lowpass_mmi(halfH, src, 16, stride);
    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
    avg_pixels16_l2_mmi(dst, halfH, halfHV, stride, 16, 16, 16);
}

void ff_avg_h264_qpel16_mc23_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t halfH[256];
    uint8_t halfHV[256];
    put_h264_qpel16_h_lowpass_mmi(halfH, src + stride, 16, stride);
    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
    avg_pixels16_l2_mmi(dst, halfH, halfHV, stride, 16, 16, 16);
}

void ff_avg_h264_qpel16_mc12_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t halfV[256];
    uint8_t halfHV[256];
    copy_block16_mmi(full, src - stride*2, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
    avg_pixels16_l2_mmi(dst, halfV, halfHV, stride, 16, 16, 16);
}

void ff_avg_h264_qpel16_mc32_mmi(uint8_t *dst, const uint8_t *src,
        ptrdiff_t stride)
{
    uint8_t full[336];
    uint8_t * const full_mid= full + 32;
    uint8_t halfV[256];
    uint8_t halfHV[256];
    copy_block16_mmi(full, src - stride*2 + 1, 16, stride, 21);
    put_h264_qpel16_v_lowpass_mmi(halfV, full_mid, 16, 16);
    put_h264_qpel16_hv_lowpass_mmi(halfHV, src, 16, stride);
    avg_pixels16_l2_mmi(dst, halfV, halfHV, stride, 16, 16, 16);
}