2 * Copyright (c) 2019 Shiyou Yin (yinshiyou-hf@loongson.cn)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavcodec/hevcdec.h"
22 #include "libavcodec/bit_depth_template.c"
23 #include "libavcodec/mips/hevcdsp_mips.h"
24 #include "libavutil/mips/mmiutils.h"
/*
 * PUT_HEVC_QPEL_HV(w, x_step, src_step, dst_step)
 *
 * Defines ff_hevc_put_hevc_qpel_hv##w##_8_mmi(), which applies an 8-tap
 * filter horizontally and then vertically to 8-bit source pixels and writes
 * 16-bit results to dst. Both passes are written as inline assembly and
 * handle four samples per inner-loop iteration.
 *
 * Macro parameters:
 *   w        - pasted into the generated function name.
 *   x_step   - value reloaded into the inner-loop counter x after every row,
 *              i.e. the number of 4-sample groups per row.
 *   src_step - byte offset added to src at the end of each row of pass 1,
 *              before the source stride is added. The instantiations pass
 *              -w, cancelling the 4 bytes per group added by the inner loop.
 *   dst_step - byte offset added at the end of each row to tmp (both passes)
 *              and to dst (pass 2), before a further 0x80 is added. The
 *              instantiations pass -2*w, cancelling the 8 bytes per group
 *              added by the inner loop, so rows end up 0x80 bytes apart.
 *              (0x80 presumably equals MAX_PB_SIZE int16_t elements -
 *              TODO confirm.)
 *
 * Pass 1 (horizontal, coefficients ff_hevc_qpel_filters[mx - 1]):
 *   - src is first moved back by QPEL_EXTRA_BEFORE rows and 3 pixels, and
 *     height + QPEL_EXTRA rows are processed.
 *   - The 8 coefficient bytes are widened to signed 16-bit lanes (byte
 *     interleave into the high byte followed by psrah by 8): taps 0-3 in
 *     ftmp1, taps 4-7 in ftmp2. ftmp0 is then cleared for use as zero.
 *   - Per group: four unaligned 8-byte loads at src+0..src+3; each is
 *     widened against zero, multiplied lane-wise by the taps (pmullh) and
 *     the two halves are added. TRANSPOSE_4H (defined outside this file,
 *     presumably a 4x4 halfword transpose) followed by three paddh sums the
 *     partial products, and the four 16-bit results are stored to tmp.
 *
 * Pass 2 (vertical, coefficients ff_hevc_qpel_filters[my - 1]):
 *   - tmp restarts at tmp_array + QPEL_EXTRA_BEFORE * 4 - 12.
 *   - Coefficients are widened as in pass 1; ftmp0 then holds the shift
 *     count 6.
 *   - Per group: eight 8-byte loads from tmp rows 0x80 bytes apart (tmp is
 *     advanced seven times and then rewound by 0x380), two TRANSPOSE_4H,
 *     pmaddhw against the low/high taps, paddw plus TRANSPOSE_2W to finish
 *     the sums, psraw by 6, packsswh back to 16 bit, and an 8-byte store
 *     to dst.
 *
 * The width argument is not referenced in the visible body (the processed
 * width is fixed by x_step), and the [stride] operand of the second asm
 * statement is not used by its instructions.
 */
26 #define PUT_HEVC_QPEL_HV(w, x_step, src_step, dst_step) \
27 void ff_hevc_put_hevc_qpel_hv##w##_8_mmi(int16_t *dst, uint8_t *_src, \
28 ptrdiff_t _srcstride, \
29 int height, intptr_t mx, \
30 intptr_t my, int width) \
33 const int8_t *filter; \
34 pixel *src = (pixel*)_src; \
35 ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
36 int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; \
37 int16_t *tmp = tmp_array; \
41 src -= (QPEL_EXTRA_BEFORE * srcstride + 3); \
42 filter = ff_hevc_qpel_filters[mx - 1]; \
44 y = height + QPEL_EXTRA; \
46 MMI_LDC1(%[ftmp1], %[filter], 0x00) \
47 "li %[rtmp0], 0x08 \n\t" \
48 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
49 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \
50 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
51 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
52 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
53 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
57 "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" \
58 "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" \
59 "gsldlc1 %[ftmp4], 0x08(%[src]) \n\t" \
60 "gsldrc1 %[ftmp4], 0x01(%[src]) \n\t" \
61 "gsldlc1 %[ftmp5], 0x09(%[src]) \n\t" \
62 "gsldrc1 %[ftmp5], 0x02(%[src]) \n\t" \
63 "gsldlc1 %[ftmp6], 0x0a(%[src]) \n\t" \
64 "gsldrc1 %[ftmp6], 0x03(%[src]) \n\t" \
65 "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
66 "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
67 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
68 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
69 "paddh %[ftmp3], %[ftmp7], %[ftmp8] \n\t" \
70 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \
71 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
72 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
73 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
74 "paddh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" \
75 "punpcklbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
76 "punpckhbh %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
77 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
78 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
79 "paddh %[ftmp5], %[ftmp7], %[ftmp8] \n\t" \
80 "punpcklbh %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
81 "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
82 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
83 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
84 "paddh %[ftmp6], %[ftmp7], %[ftmp8] \n\t" \
85 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
86 %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \
87 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
88 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
89 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
90 "gssdlc1 %[ftmp3], 0x07(%[tmp]) \n\t" \
91 "gssdrc1 %[ftmp3], 0x00(%[tmp]) \n\t" \
93 "daddi %[x], %[x], -0x01 \n\t" \
94 PTR_ADDIU "%[src], %[src], 0x04 \n\t" \
95 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
96 "bnez %[x], 2b \n\t" \
98 "daddi %[y], %[y], -0x01 \n\t" \
99 "li %[x], " #x_step " \n\t" \
100 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \
101 PTR_ADDIU "%[tmp], %[tmp], " #dst_step " \n\t" \
102 PTR_ADDU "%[src], %[src], %[stride] \n\t" \
103 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
104 "bnez %[y], 1b \n\t" \
105 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
106 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
107 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
108 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
109 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
110 [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]), \
111 [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y), \
113 : [filter]"r"(filter), [stride]"r"(srcstride) \
117 tmp = tmp_array + QPEL_EXTRA_BEFORE * 4 -12; \
118 filter = ff_hevc_qpel_filters[my - 1]; \
122 MMI_LDC1(%[ftmp1], %[filter], 0x00) \
123 "li %[rtmp0], 0x08 \n\t" \
124 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
125 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \
126 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
127 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
128 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
129 "li %[rtmp0], 0x06 \n\t" \
130 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
134 "gsldlc1 %[ftmp3], 0x07(%[tmp]) \n\t" \
135 "gsldrc1 %[ftmp3], 0x00(%[tmp]) \n\t" \
136 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
137 "gsldlc1 %[ftmp4], 0x07(%[tmp]) \n\t" \
138 "gsldrc1 %[ftmp4], 0x00(%[tmp]) \n\t" \
139 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
140 "gsldlc1 %[ftmp5], 0x07(%[tmp]) \n\t" \
141 "gsldrc1 %[ftmp5], 0x00(%[tmp]) \n\t" \
142 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
143 "gsldlc1 %[ftmp6], 0x07(%[tmp]) \n\t" \
144 "gsldrc1 %[ftmp6], 0x00(%[tmp]) \n\t" \
145 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
146 "gsldlc1 %[ftmp7], 0x07(%[tmp]) \n\t" \
147 "gsldrc1 %[ftmp7], 0x00(%[tmp]) \n\t" \
148 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
149 "gsldlc1 %[ftmp8], 0x07(%[tmp]) \n\t" \
150 "gsldrc1 %[ftmp8], 0x00(%[tmp]) \n\t" \
151 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
152 "gsldlc1 %[ftmp9], 0x07(%[tmp]) \n\t" \
153 "gsldrc1 %[ftmp9], 0x00(%[tmp]) \n\t" \
154 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
155 "gsldlc1 %[ftmp10], 0x07(%[tmp]) \n\t" \
156 "gsldrc1 %[ftmp10], 0x00(%[tmp]) \n\t" \
157 PTR_ADDIU "%[tmp], %[tmp], -0x380 \n\t" \
158 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
159 %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \
160 TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10], \
161 %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \
162 "pmaddhw %[ftmp11], %[ftmp3], %[ftmp1] \n\t" \
163 "pmaddhw %[ftmp12], %[ftmp7], %[ftmp2] \n\t" \
164 "pmaddhw %[ftmp13], %[ftmp4], %[ftmp1] \n\t" \
165 "pmaddhw %[ftmp14], %[ftmp8], %[ftmp2] \n\t" \
166 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \
167 "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \
168 TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4]) \
169 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
170 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
171 "pmaddhw %[ftmp11], %[ftmp5], %[ftmp1] \n\t" \
172 "pmaddhw %[ftmp12], %[ftmp9], %[ftmp2] \n\t" \
173 "pmaddhw %[ftmp13], %[ftmp6], %[ftmp1] \n\t" \
174 "pmaddhw %[ftmp14], %[ftmp10], %[ftmp2] \n\t" \
175 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \
176 "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \
177 TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6]) \
178 "paddw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
179 "psraw %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \
180 "packsswh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
181 "gssdlc1 %[ftmp3], 0x07(%[dst]) \n\t" \
182 "gssdrc1 %[ftmp3], 0x00(%[dst]) \n\t" \
184 "daddi %[x], %[x], -0x01 \n\t" \
185 PTR_ADDIU "%[dst], %[dst], 0x08 \n\t" \
186 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
187 "bnez %[x], 2b \n\t" \
189 "daddi %[y], %[y], -0x01 \n\t" \
190 "li %[x], " #x_step " \n\t" \
191 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \
192 PTR_ADDIU "%[tmp], %[tmp], " #dst_step " \n\t" \
193 PTR_ADDIU "%[dst], %[dst], 0x80 \n\t" \
194 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
195 "bnez %[y], 1b \n\t" \
196 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
197 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
198 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
199 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
200 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
201 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), \
202 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), \
203 [ftmp14]"=&f"(ftmp[14]), [rtmp0]"=&r"(rtmp[0]), \
204 [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), \
206 : [filter]"r"(filter), [stride]"r"(srcstride) \
211 PUT_HEVC_QPEL_HV(4, 1, -4, -8); /* args: w, x_step, src_step, dst_step; x_step = w / 4 */
212 PUT_HEVC_QPEL_HV(8, 2, -8, -16); /* src_step = -w bytes, dst_step = -2 * w bytes */
213 PUT_HEVC_QPEL_HV(12, 3, -12, -24); /* 3 groups of 4 samples per row */
214 PUT_HEVC_QPEL_HV(16, 4, -16, -32); /* 4 groups of 4 samples per row */
215 PUT_HEVC_QPEL_HV(24, 6, -24, -48); /* 6 groups of 4 samples per row */
216 PUT_HEVC_QPEL_HV(32, 8, -32, -64); /* 8 groups of 4 samples per row */
217 PUT_HEVC_QPEL_HV(48, 12, -48, -96); /* 12 groups of 4 samples per row */
218 PUT_HEVC_QPEL_HV(64, 16, -64, -128); /* 16 groups of 4 samples per row */
/*
 * PUT_HEVC_QPEL_BI_HV(w, x_step, src_step, src2_step, dst_step)
 *
 * Defines ff_hevc_put_hevc_qpel_bi_hv##w##_8_mmi(), which applies an 8-tap
 * filter horizontally and vertically to 8-bit source pixels, adds the
 * 16-bit samples read from src2, and writes 8-bit pixels to dst. Both
 * passes are inline assembly and handle four samples per inner-loop
 * iteration.
 *
 * Macro parameters:
 *   w         - pasted into the generated function name.
 *   x_step    - number of 4-sample groups per row (loaded into x).
 *   src_step  - byte offset added to src at the end of each pass-1 row,
 *               before the source stride is added (instantiated as -w).
 *   src2_step - byte offset added at the end of each row to tmp (both
 *               passes) and to src2 (pass 2), before a further 0x80 is
 *               added (instantiated as -2*w, cancelling the 8 bytes per
 *               group added by the inner loops).
 *   dst_step  - byte offset added to dst at the end of each pass-2 row,
 *               before the destination stride is added (instantiated as
 *               -w, cancelling the 4 bytes per group added by the loop).
 *
 * Pass 1 (horizontal, coefficients ff_hevc_qpel_filters[mx - 1]):
 *   src is moved back by QPEL_EXTRA_BEFORE rows and 3 pixels and
 *   height + QPEL_EXTRA rows are filtered. For each group, four unaligned
 *   8-byte loads at src+0..src+3 are widened to 16 bit, multiplied by the
 *   widened taps and reduced (TRANSPOSE_4H + paddh) to four 16-bit sums
 *   that are stored to tmp.
 *
 * Pass 2 (vertical, coefficients ff_hevc_qpel_filters[my - 1]):
 *   For each group, eight tmp rows 0x80 bytes apart are loaded, transposed
 *   and multiplied with pmaddhw; the sums are shifted right by 6 and packed
 *   to 16 bit. Four int16_t values are then loaded from src2; both operands
 *   are sign-extended to 32 bit (interleave into the upper halfword, psraw
 *   by 16), added together with offset, shifted right by shift, packed with
 *   signed saturation, negative lanes are zeroed (pcmpgth + and), and the
 *   result is packed to unsigned bytes and stored as 4 bytes to dst.
 *
 * offset is broadcast to both 32-bit words with punpcklwd before the loops.
 * NOTE(review): the definitions and values of offset and shift are not
 * visible in this part of the file - confirm against the declarations.
 */
220 #define PUT_HEVC_QPEL_BI_HV(w, x_step, src_step, src2_step, dst_step) \
221 void ff_hevc_put_hevc_qpel_bi_hv##w##_8_mmi(uint8_t *_dst, \
222 ptrdiff_t _dststride, \
224 ptrdiff_t _srcstride, \
225 int16_t *src2, int height, \
226 intptr_t mx, intptr_t my, \
230 const int8_t *filter; \
231 pixel *src = (pixel*)_src; \
232 ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
233 pixel *dst = (pixel *)_dst; \
234 ptrdiff_t dststride = _dststride / sizeof(pixel); \
235 int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; \
236 int16_t *tmp = tmp_array; \
242 src -= (QPEL_EXTRA_BEFORE * srcstride + 3); \
243 filter = ff_hevc_qpel_filters[mx - 1]; \
245 y = height + QPEL_EXTRA; \
247 MMI_LDC1(%[ftmp1], %[filter], 0x00) \
248 "li %[rtmp0], 0x08 \n\t" \
249 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
250 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \
251 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
252 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
253 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
254 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
258 "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" \
259 "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" \
260 "gsldlc1 %[ftmp4], 0x08(%[src]) \n\t" \
261 "gsldrc1 %[ftmp4], 0x01(%[src]) \n\t" \
262 "gsldlc1 %[ftmp5], 0x09(%[src]) \n\t" \
263 "gsldrc1 %[ftmp5], 0x02(%[src]) \n\t" \
264 "gsldlc1 %[ftmp6], 0x0a(%[src]) \n\t" \
265 "gsldrc1 %[ftmp6], 0x03(%[src]) \n\t" \
266 "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
267 "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
268 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
269 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
270 "paddh %[ftmp3], %[ftmp7], %[ftmp8] \n\t" \
271 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \
272 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
273 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
274 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
275 "paddh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" \
276 "punpcklbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
277 "punpckhbh %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
278 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
279 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
280 "paddh %[ftmp5], %[ftmp7], %[ftmp8] \n\t" \
281 "punpcklbh %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
282 "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
283 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
284 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
285 "paddh %[ftmp6], %[ftmp7], %[ftmp8] \n\t" \
286 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
287 %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \
288 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
289 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
290 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
291 "gssdlc1 %[ftmp3], 0x07(%[tmp]) \n\t" \
292 "gssdrc1 %[ftmp3], 0x00(%[tmp]) \n\t" \
294 "daddi %[x], %[x], -0x01 \n\t" \
295 PTR_ADDIU "%[src], %[src], 0x04 \n\t" \
296 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
297 "bnez %[x], 2b \n\t" \
299 "daddi %[y], %[y], -0x01 \n\t" \
300 "li %[x], " #x_step " \n\t" \
301 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \
302 PTR_ADDIU "%[tmp], %[tmp], " #src2_step " \n\t" \
303 PTR_ADDU "%[src], %[src], %[stride] \n\t" \
304 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
305 "bnez %[y], 1b \n\t" \
306 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
307 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
308 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
309 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
310 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
311 [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]), \
312 [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y), \
314 : [filter]"r"(filter), [stride]"r"(srcstride) \
319 filter = ff_hevc_qpel_filters[my - 1]; \
323 MMI_LDC1(%[ftmp1], %[filter], 0x00) \
324 "li %[rtmp0], 0x08 \n\t" \
325 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
326 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \
327 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
328 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
329 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
330 "li %[rtmp0], 0x06 \n\t" \
331 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
332 "punpcklwd %[offset], %[offset], %[offset] \n\t" \
335 "li %[x], " #x_step " \n\t" \
337 "gsldlc1 %[ftmp3], 0x07(%[tmp]) \n\t" \
338 "gsldrc1 %[ftmp3], 0x00(%[tmp]) \n\t" \
339 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
340 "gsldlc1 %[ftmp4], 0x07(%[tmp]) \n\t" \
341 "gsldrc1 %[ftmp4], 0x00(%[tmp]) \n\t" \
342 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
343 "gsldlc1 %[ftmp5], 0x07(%[tmp]) \n\t" \
344 "gsldrc1 %[ftmp5], 0x00(%[tmp]) \n\t" \
345 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
346 "gsldlc1 %[ftmp6], 0x07(%[tmp]) \n\t" \
347 "gsldrc1 %[ftmp6], 0x00(%[tmp]) \n\t" \
348 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
349 "gsldlc1 %[ftmp7], 0x07(%[tmp]) \n\t" \
350 "gsldrc1 %[ftmp7], 0x00(%[tmp]) \n\t" \
351 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
352 "gsldlc1 %[ftmp8], 0x07(%[tmp]) \n\t" \
353 "gsldrc1 %[ftmp8], 0x00(%[tmp]) \n\t" \
354 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
355 "gsldlc1 %[ftmp9], 0x07(%[tmp]) \n\t" \
356 "gsldrc1 %[ftmp9], 0x00(%[tmp]) \n\t" \
357 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
358 "gsldlc1 %[ftmp10], 0x07(%[tmp]) \n\t" \
359 "gsldrc1 %[ftmp10], 0x00(%[tmp]) \n\t" \
360 PTR_ADDIU "%[tmp], %[tmp], -0x380 \n\t" \
361 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
362 %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \
363 TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10], \
364 %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \
365 "pmaddhw %[ftmp11], %[ftmp3], %[ftmp1] \n\t" \
366 "pmaddhw %[ftmp12], %[ftmp7], %[ftmp2] \n\t" \
367 "pmaddhw %[ftmp13], %[ftmp4], %[ftmp1] \n\t" \
368 "pmaddhw %[ftmp14], %[ftmp8], %[ftmp2] \n\t" \
369 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \
370 "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \
371 TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4]) \
372 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
373 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
374 "pmaddhw %[ftmp11], %[ftmp5], %[ftmp1] \n\t" \
375 "pmaddhw %[ftmp12], %[ftmp9], %[ftmp2] \n\t" \
376 "pmaddhw %[ftmp13], %[ftmp6], %[ftmp1] \n\t" \
377 "pmaddhw %[ftmp14], %[ftmp10], %[ftmp2] \n\t" \
378 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \
379 "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \
380 TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6]) \
381 "paddw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
382 "psraw %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \
383 "packsswh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
384 "gsldlc1 %[ftmp4], 0x07(%[src2]) \n\t" \
385 "gsldrc1 %[ftmp4], 0x00(%[src2]) \n\t" \
386 "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
387 "li %[rtmp0], 0x10 \n\t" \
388 "dmtc1 %[rtmp0], %[ftmp8] \n\t" \
389 "punpcklhw %[ftmp5], %[ftmp7], %[ftmp3] \n\t" \
390 "punpckhhw %[ftmp6], %[ftmp7], %[ftmp3] \n\t" \
391 "punpckhhw %[ftmp3], %[ftmp7], %[ftmp4] \n\t" \
392 "punpcklhw %[ftmp4], %[ftmp7], %[ftmp4] \n\t" \
393 "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t" \
394 "psraw %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
395 "psraw %[ftmp3], %[ftmp3], %[ftmp8] \n\t" \
396 "psraw %[ftmp4], %[ftmp4], %[ftmp8] \n\t" \
397 "paddw %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
398 "paddw %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
399 "paddw %[ftmp5], %[ftmp5], %[offset] \n\t" \
400 "paddw %[ftmp6], %[ftmp6], %[offset] \n\t" \
401 "psraw %[ftmp5], %[ftmp5], %[shift] \n\t" \
402 "psraw %[ftmp6], %[ftmp6], %[shift] \n\t" \
403 "packsswh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
404 "pcmpgth %[ftmp7], %[ftmp5], %[ftmp7] \n\t" \
405 "and %[ftmp3], %[ftmp5], %[ftmp7] \n\t" \
406 "packushb %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
407 "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t" \
408 "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t" \
410 "daddi %[x], %[x], -0x01 \n\t" \
411 PTR_ADDIU "%[src2], %[src2], 0x08 \n\t" \
412 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
413 PTR_ADDIU "%[dst], %[dst], 0x04 \n\t" \
414 "bnez %[x], 2b \n\t" \
416 "daddi %[y], %[y], -0x01 \n\t" \
417 PTR_ADDIU "%[src2], %[src2], " #src2_step " \n\t" \
418 PTR_ADDIU "%[tmp], %[tmp], " #src2_step " \n\t" \
419 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \
420 PTR_ADDIU "%[src2], %[src2], 0x80 \n\t" \
421 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
422 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
423 "bnez %[y], 1b \n\t" \
424 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
425 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
426 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
427 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
428 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
429 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), \
430 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), \
431 [ftmp14]"=&f"(ftmp[14]), [src2]"+&r"(src2), \
432 [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x), \
433 [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0]) \
434 : [filter]"r"(filter), [stride]"r"(dststride), \
440 PUT_HEVC_QPEL_BI_HV(4, 1, -4, -8, -4); /* args: w, x_step, src_step, src2_step, dst_step; x_step = w / 4 */
441 PUT_HEVC_QPEL_BI_HV(8, 2, -8, -16, -8); /* src_step = dst_step = -w bytes, src2_step = -2 * w bytes */
442 PUT_HEVC_QPEL_BI_HV(12, 3, -12, -24, -12); /* 3 groups of 4 samples per row */
443 PUT_HEVC_QPEL_BI_HV(16, 4, -16, -32, -16); /* 4 groups of 4 samples per row */
444 PUT_HEVC_QPEL_BI_HV(24, 6, -24, -48, -24); /* 6 groups of 4 samples per row */
445 PUT_HEVC_QPEL_BI_HV(32, 8, -32, -64, -32); /* 8 groups of 4 samples per row */
446 PUT_HEVC_QPEL_BI_HV(48, 12, -48, -96, -48); /* 12 groups of 4 samples per row */
447 PUT_HEVC_QPEL_BI_HV(64, 16, -64, -128, -64); /* 16 groups of 4 samples per row */
/*
 * PUT_HEVC_EPEL_BI_HV(w, x_step, src_step, src2_step, dst_step)
 *
 * Defines ff_hevc_put_hevc_epel_bi_hv##w##_8_mmi(), the 4-tap counterpart
 * of the qpel bi_hv function: 8-bit source pixels are filtered horizontally
 * and vertically, the 16-bit samples from src2 are added, and 8-bit pixels
 * are written to dst. Both passes are inline assembly and handle four
 * samples per inner-loop iteration.
 *
 * Macro parameters:
 *   w         - pasted into the generated function name.
 *   x_step    - number of 4-sample groups per row (loaded into x).
 *   src_step  - byte offset added to src at the end of each pass-1 row,
 *               before the source stride is added (instantiated as -w).
 *   src2_step - byte offset added at the end of each row to tmp (both
 *               passes) and to src2 (pass 2), before a further 0x80 is
 *               added (instantiated as -2*w).
 *   dst_step  - byte offset added to dst at the end of each pass-2 row,
 *               before the destination stride is added (instantiated as -w).
 *
 * Pass 1 (horizontal, coefficients ff_hevc_epel_filters[mx - 1]):
 *   src is moved back by EPEL_EXTRA_BEFORE rows and 1 pixel and
 *   height + EPEL_EXTRA rows are filtered. The four coefficient bytes are
 *   widened to signed 16-bit lanes in ftmp1. For each group, four unaligned
 *   4-byte loads at src+0..src+3 are widened against zero, multiplied by
 *   the taps (pmullh) and reduced with TRANSPOSE_4H + paddh to four 16-bit
 *   sums that are stored to tmp.
 *
 * Pass 2 (vertical, coefficients ff_hevc_epel_filters[my - 1]):
 *   For each group, four tmp rows 0x80 bytes apart are loaded (tmp is
 *   advanced three times and rewound by 0x180), transposed and multiplied
 *   with pmaddhw; the sums are shifted right by 6 and packed to 16 bit.
 *   Four int16_t values are loaded from src2; both operands are
 *   sign-extended to 32 bit, added together with offset, shifted right by
 *   shift, packed with signed saturation, negative lanes are zeroed
 *   (pcmpgth against the zeroed ftmp2, then and), and the result is packed
 *   to unsigned bytes and stored as 4 bytes to dst.
 *
 * offset is broadcast to both 32-bit words with punpcklwd before the loops.
 * NOTE(review): the definitions and values of offset and shift are not
 * visible in this part of the file - confirm against the declarations.
 */
449 #define PUT_HEVC_EPEL_BI_HV(w, x_step, src_step, src2_step, dst_step) \
450 void ff_hevc_put_hevc_epel_bi_hv##w##_8_mmi(uint8_t *_dst, \
451 ptrdiff_t _dststride, \
453 ptrdiff_t _srcstride, \
454 int16_t *src2, int height, \
455 intptr_t mx, intptr_t my, \
459 pixel *src = (pixel *)_src; \
460 ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
461 pixel *dst = (pixel *)_dst; \
462 ptrdiff_t dststride = _dststride / sizeof(pixel); \
463 const int8_t *filter = ff_hevc_epel_filters[mx - 1]; \
464 int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; \
465 int16_t *tmp = tmp_array; \
471 src -= (EPEL_EXTRA_BEFORE * srcstride + 1); \
473 y = height + EPEL_EXTRA; \
475 MMI_LWC1(%[ftmp1], %[filter], 0x00) \
476 "li %[rtmp0], 0x08 \n\t" \
477 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
478 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
479 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
480 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
484 "gslwlc1 %[ftmp2], 0x03(%[src]) \n\t" \
485 "gslwrc1 %[ftmp2], 0x00(%[src]) \n\t" \
486 "gslwlc1 %[ftmp3], 0x04(%[src]) \n\t" \
487 "gslwrc1 %[ftmp3], 0x01(%[src]) \n\t" \
488 "gslwlc1 %[ftmp4], 0x05(%[src]) \n\t" \
489 "gslwrc1 %[ftmp4], 0x02(%[src]) \n\t" \
490 "gslwlc1 %[ftmp5], 0x06(%[src]) \n\t" \
491 "gslwrc1 %[ftmp5], 0x03(%[src]) \n\t" \
492 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
493 "pmullh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \
494 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
495 "pmullh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \
496 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \
497 "pmullh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
498 "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \
499 "pmullh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
500 TRANSPOSE_4H(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5], \
501 %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9]) \
502 "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
503 "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
504 "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
505 "gssdlc1 %[ftmp2], 0x07(%[tmp]) \n\t" \
506 "gssdrc1 %[ftmp2], 0x00(%[tmp]) \n\t" \
508 "daddi %[x], %[x], -0x01 \n\t" \
509 PTR_ADDIU "%[src], %[src], 0x04 \n\t" \
510 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
511 "bnez %[x], 2b \n\t" \
513 "daddi %[y], %[y], -0x01 \n\t" \
514 "li %[x], " #x_step " \n\t" \
515 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \
516 PTR_ADDIU "%[tmp], %[tmp], " #src2_step " \n\t" \
517 PTR_ADDU "%[src], %[src], %[stride] \n\t" \
518 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
519 "bnez %[y], 1b \n\t" \
520 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
521 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
522 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
523 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
524 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
525 [rtmp0]"=&r"(rtmp[0]), \
526 [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y), \
528 : [filter]"r"(filter), [stride]"r"(srcstride) \
533 filter = ff_hevc_epel_filters[my - 1]; \
537 MMI_LWC1(%[ftmp1], %[filter], 0x00) \
538 "li %[rtmp0], 0x08 \n\t" \
539 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
540 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
541 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
542 "li %[rtmp0], 0x06 \n\t" \
543 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
544 "punpcklwd %[offset], %[offset], %[offset] \n\t" \
545 "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" \
548 "li %[x], " #x_step " \n\t" \
550 "gsldlc1 %[ftmp3], 0x07(%[tmp]) \n\t" \
551 "gsldrc1 %[ftmp3], 0x00(%[tmp]) \n\t" \
552 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
553 "gsldlc1 %[ftmp4], 0x07(%[tmp]) \n\t" \
554 "gsldrc1 %[ftmp4], 0x00(%[tmp]) \n\t" \
555 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
556 "gsldlc1 %[ftmp5], 0x07(%[tmp]) \n\t" \
557 "gsldrc1 %[ftmp5], 0x00(%[tmp]) \n\t" \
558 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
559 "gsldlc1 %[ftmp6], 0x07(%[tmp]) \n\t" \
560 "gsldrc1 %[ftmp6], 0x00(%[tmp]) \n\t" \
561 PTR_ADDIU "%[tmp], %[tmp], -0x180 \n\t" \
562 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
563 %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \
564 "pmaddhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" \
565 "pmaddhw %[ftmp8], %[ftmp4], %[ftmp1] \n\t" \
566 TRANSPOSE_2W(%[ftmp7], %[ftmp8], %[ftmp3], %[ftmp4]) \
567 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
568 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
569 "pmaddhw %[ftmp7], %[ftmp5], %[ftmp1] \n\t" \
570 "pmaddhw %[ftmp8], %[ftmp6], %[ftmp1] \n\t" \
571 TRANSPOSE_2W(%[ftmp7], %[ftmp8], %[ftmp5], %[ftmp6]) \
572 "paddw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
573 "psraw %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \
574 "packsswh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
575 "gsldlc1 %[ftmp4], 0x07(%[src2]) \n\t" \
576 "gsldrc1 %[ftmp4], 0x00(%[src2]) \n\t" \
577 "li %[rtmp0], 0x10 \n\t" \
578 "dmtc1 %[rtmp0], %[ftmp8] \n\t" \
579 "punpcklhw %[ftmp5], %[ftmp2], %[ftmp3] \n\t" \
580 "punpckhhw %[ftmp6], %[ftmp2], %[ftmp3] \n\t" \
581 "punpckhhw %[ftmp3], %[ftmp2], %[ftmp4] \n\t" \
582 "punpcklhw %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \
583 "psraw %[ftmp5], %[ftmp5], %[ftmp8] \n\t" \
584 "psraw %[ftmp6], %[ftmp6], %[ftmp8] \n\t" \
585 "psraw %[ftmp3], %[ftmp3], %[ftmp8] \n\t" \
586 "psraw %[ftmp4], %[ftmp4], %[ftmp8] \n\t" \
587 "paddw %[ftmp5], %[ftmp5], %[ftmp4] \n\t" \
588 "paddw %[ftmp6], %[ftmp6], %[ftmp3] \n\t" \
589 "paddw %[ftmp5], %[ftmp5], %[offset] \n\t" \
590 "paddw %[ftmp6], %[ftmp6], %[offset] \n\t" \
591 "psraw %[ftmp5], %[ftmp5], %[shift] \n\t" \
592 "psraw %[ftmp6], %[ftmp6], %[shift] \n\t" \
593 "packsswh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
594 "pcmpgth %[ftmp7], %[ftmp5], %[ftmp2] \n\t" \
595 "and %[ftmp3], %[ftmp5], %[ftmp7] \n\t" \
596 "packushb %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
597 "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t" \
598 "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t" \
600 "daddi %[x], %[x], -0x01 \n\t" \
601 PTR_ADDIU "%[src2], %[src2], 0x08 \n\t" \
602 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
603 PTR_ADDIU "%[dst], %[dst], 0x04 \n\t" \
604 "bnez %[x], 2b \n\t" \
606 "daddi %[y], %[y], -0x01 \n\t" \
607 PTR_ADDIU "%[src2], %[src2], " #src2_step " \n\t" \
608 PTR_ADDIU "%[tmp], %[tmp], " #src2_step " \n\t" \
609 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \
610 PTR_ADDIU "%[src2], %[src2], 0x80 \n\t" \
611 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
612 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
613 "bnez %[y], 1b \n\t" \
614 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
615 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
616 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
617 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
618 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
619 [ftmp10]"=&f"(ftmp[10]), [src2]"+&r"(src2), \
620 [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x), \
621 [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0]) \
622 : [filter]"r"(filter), [stride]"r"(dststride), \
628 PUT_HEVC_EPEL_BI_HV(4, 1, -4, -8, -4); /* args: w, x_step, src_step, src2_step, dst_step; x_step = w / 4 */
629 PUT_HEVC_EPEL_BI_HV(8, 2, -8, -16, -8); /* src_step = dst_step = -w bytes, src2_step = -2 * w bytes */
630 PUT_HEVC_EPEL_BI_HV(12, 3, -12, -24, -12); /* 3 groups of 4 samples per row */
631 PUT_HEVC_EPEL_BI_HV(16, 4, -16, -32, -16); /* 4 groups of 4 samples per row */
632 PUT_HEVC_EPEL_BI_HV(24, 6, -24, -48, -24); /* 6 groups of 4 samples per row */
633 PUT_HEVC_EPEL_BI_HV(32, 8, -32, -64, -32); /* 8 groups of 4 samples per row */
/*
 * PUT_HEVC_PEL_BI_PIXELS(w, x_step, src_step, dst_step, src2_step)
 *
 * Defines ff_hevc_put_hevc_pel_bi_pixels##w##_8_mmi(), which combines
 * unfiltered 8-bit source pixels with the 16-bit samples in src2 and writes
 * 8-bit pixels to dst. The work is done in one inline-assembly loop nest
 * that handles eight pixels per inner-loop iteration.
 *
 * Macro parameters:
 *   w         - pasted into the generated function name.
 *   x_step    - number of 8-pixel groups per row (reloaded into x after
 *               every row).
 *   src_step  - byte offset added to src at the end of each row, before
 *               srcstride is added (instantiated as -w).
 *   dst_step  - byte offset added to dst at the end of each row, before
 *               dststride is added (instantiated as -w).
 *   src2_step - byte offset added to src2 at the end of each row, before a
 *               further 0x80 is added (instantiated as -2*w).
 *
 * Setup: ftmp0 = 0, ftmp1 = shift count 6, ftmp10 = shift count 16, and
 * offset = 0x40 replicated into all four 16-bit lanes (punpcklhw followed
 * by punpcklwd).
 *
 * Per group:
 *   - 8 bytes are loaded from src and 16 bytes (eight int16_t) from src2.
 *   - The source bytes are widened to 16 bit, shifted left by 6, offset is
 *     added, and the result is zero-extended to 32 bit.
 *   - The src2 samples are sign-extended to 32 bit (interleave into the
 *     upper halfword, psraw by 16) and added to the widened source values.
 *   - The sums are shifted right by shift, packed with signed saturation,
 *     negative lanes are zeroed (pcmpgth against zero, then and), packed to
 *     unsigned bytes and stored as 8 bytes to dst.
 *   - src and dst advance by 8 bytes, src2 by 16 bytes.
 *
 * NOTE(review): the definition and value of shift are not visible in this
 * part of the file - confirm against the declarations.
 */
635 #define PUT_HEVC_PEL_BI_PIXELS(w, x_step, src_step, dst_step, src2_step) \
636 void ff_hevc_put_hevc_pel_bi_pixels##w##_8_mmi(uint8_t *_dst, \
637 ptrdiff_t _dststride, \
639 ptrdiff_t _srcstride, \
640 int16_t *src2, int height, \
641 intptr_t mx, intptr_t my, \
645 pixel *src = (pixel *)_src; \
646 ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
647 pixel *dst = (pixel *)_dst; \
648 ptrdiff_t dststride = _dststride / sizeof(pixel); \
656 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
657 "li %[rtmp0], 0x06 \n\t" \
658 "dmtc1 %[rtmp0], %[ftmp1] \n\t" \
659 "li %[rtmp0], 0x10 \n\t" \
660 "dmtc1 %[rtmp0], %[ftmp10] \n\t" \
661 "li %[rtmp0], 0x40 \n\t" \
662 "dmtc1 %[rtmp0], %[offset] \n\t" \
663 "punpcklhw %[offset], %[offset], %[offset] \n\t" \
664 "punpcklwd %[offset], %[offset], %[offset] \n\t" \
668 "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" \
669 "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" \
670 "gsldlc1 %[ftmp2], 0x07(%[src2]) \n\t" \
671 "gsldrc1 %[ftmp2], 0x00(%[src2]) \n\t" \
672 "gsldlc1 %[ftmp3], 0x0f(%[src2]) \n\t" \
673 "gsldrc1 %[ftmp3], 0x08(%[src2]) \n\t" \
674 "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" \
675 "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \
676 "psllh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \
677 "psllh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \
678 "paddh %[ftmp4], %[ftmp4], %[offset] \n\t" \
679 "paddh %[ftmp5], %[ftmp5], %[offset] \n\t" \
680 "punpcklhw %[ftmp6], %[ftmp4], %[ftmp0] \n\t" \
681 "punpckhhw %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \
682 "punpcklhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
683 "punpckhhw %[ftmp9], %[ftmp5], %[ftmp0] \n\t" \
684 "punpcklhw %[ftmp4], %[ftmp0], %[ftmp3] \n\t" \
685 "punpckhhw %[ftmp5], %[ftmp0], %[ftmp3] \n\t" \
686 "punpckhhw %[ftmp3], %[ftmp0], %[ftmp2] \n\t" \
687 "punpcklhw %[ftmp2], %[ftmp0], %[ftmp2] \n\t" \
688 "psraw %[ftmp2], %[ftmp2], %[ftmp10] \n\t" \
689 "psraw %[ftmp3], %[ftmp3], %[ftmp10] \n\t" \
690 "psraw %[ftmp4], %[ftmp4], %[ftmp10] \n\t" \
691 "psraw %[ftmp5], %[ftmp5], %[ftmp10] \n\t" \
692 "paddw %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \
693 "paddw %[ftmp3], %[ftmp3], %[ftmp7] \n\t" \
694 "paddw %[ftmp4], %[ftmp4], %[ftmp8] \n\t" \
695 "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
696 "psraw %[ftmp2], %[ftmp2], %[shift] \n\t" \
697 "psraw %[ftmp3], %[ftmp3], %[shift] \n\t" \
698 "psraw %[ftmp4], %[ftmp4], %[shift] \n\t" \
699 "psraw %[ftmp5], %[ftmp5], %[shift] \n\t" \
700 "packsswh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
701 "packsswh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
702 "pcmpgth %[ftmp3], %[ftmp2], %[ftmp0] \n\t" \
703 "pcmpgth %[ftmp5], %[ftmp4], %[ftmp0] \n\t" \
704 "and %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \
705 "and %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
706 "packushb %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \
707 "gssdlc1 %[ftmp2], 0x07(%[dst]) \n\t" \
708 "gssdrc1 %[ftmp2], 0x00(%[dst]) \n\t" \
710 "daddi %[x], %[x], -0x01 \n\t" \
711 PTR_ADDIU "%[src], %[src], 0x08 \n\t" \
712 PTR_ADDIU "%[dst], %[dst], 0x08 \n\t" \
713 PTR_ADDIU "%[src2], %[src2], 0x10 \n\t" \
714 "bnez %[x], 2b \n\t" \
716 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \
717 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \
718 PTR_ADDIU "%[src2], %[src2], " #src2_step " \n\t" \
719 "li %[x], " #x_step " \n\t" \
720 "daddi %[y], %[y], -0x01 \n\t" \
721 PTR_ADDU "%[src], %[src], %[srcstride] \n\t" \
722 PTR_ADDU "%[dst], %[dst], %[dststride] \n\t" \
723 PTR_ADDIU "%[src2], %[src2], 0x80 \n\t" \
724 "bnez %[y], 1b \n\t" \
725 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
726 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
727 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
728 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
729 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
730 [ftmp10]"=&f"(ftmp[10]), [offset]"=&f"(ftmp[11]), \
731 [src2]"+&r"(src2), [dst]"+&r"(dst), [src]"+&r"(src), \
732 [x]"+&r"(x), [y]"+&r"(y), [rtmp0]"=&r"(rtmp[0]) \
733 : [dststride]"r"(dststride), [shift]"f"(shift), \
734 [srcstride]"r"(srcstride) \
739 PUT_HEVC_PEL_BI_PIXELS(8, 1, -8, -8, -16); /* args: w, x_step, src_step, dst_step, src2_step; x_step = w / 8 */
740 PUT_HEVC_PEL_BI_PIXELS(16, 2, -16, -16, -32); /* src_step = dst_step = -w bytes, src2_step = -2 * w bytes */
741 PUT_HEVC_PEL_BI_PIXELS(24, 3, -24, -24, -48); /* 3 groups of 8 pixels per row */
742 PUT_HEVC_PEL_BI_PIXELS(32, 4, -32, -32, -64); /* 4 groups of 8 pixels per row */
743 PUT_HEVC_PEL_BI_PIXELS(48, 6, -48, -48, -96); /* 6 groups of 8 pixels per row */
744 PUT_HEVC_PEL_BI_PIXELS(64, 8, -64, -64, -128); /* 8 groups of 8 pixels per row */
746 #define PUT_HEVC_QPEL_UNI_HV(w, x_step, src_step, dst_step, tmp_step) \
747 void ff_hevc_put_hevc_qpel_uni_hv##w##_8_mmi(uint8_t *_dst, \
748 ptrdiff_t _dststride, \
750 ptrdiff_t _srcstride, \
752 intptr_t mx, intptr_t my, \
756 const int8_t *filter; \
757 pixel *src = (pixel*)_src; \
758 ptrdiff_t srcstride = _srcstride / sizeof(pixel); \
759 pixel *dst = (pixel *)_dst; \
760 ptrdiff_t dststride = _dststride / sizeof(pixel); \
761 int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; \
762 int16_t *tmp = tmp_array; \
768 src -= (QPEL_EXTRA_BEFORE * srcstride + 3); \
769 filter = ff_hevc_qpel_filters[mx - 1]; \
771 y = height + QPEL_EXTRA; \
773 MMI_LDC1(%[ftmp1], %[filter], 0x00) \
774 "li %[rtmp0], 0x08 \n\t" \
775 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
776 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \
777 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
778 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
779 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
780 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
784 "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" \
785 "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" \
786 "gsldlc1 %[ftmp4], 0x08(%[src]) \n\t" \
787 "gsldrc1 %[ftmp4], 0x01(%[src]) \n\t" \
788 "gsldlc1 %[ftmp5], 0x09(%[src]) \n\t" \
789 "gsldrc1 %[ftmp5], 0x02(%[src]) \n\t" \
790 "gsldlc1 %[ftmp6], 0x0a(%[src]) \n\t" \
791 "gsldrc1 %[ftmp6], 0x03(%[src]) \n\t" \
792 "punpcklbh %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
793 "punpckhbh %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
794 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
795 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
796 "paddh %[ftmp3], %[ftmp7], %[ftmp8] \n\t" \
797 "punpcklbh %[ftmp7], %[ftmp4], %[ftmp0] \n\t" \
798 "punpckhbh %[ftmp8], %[ftmp4], %[ftmp0] \n\t" \
799 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
800 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
801 "paddh %[ftmp4], %[ftmp7], %[ftmp8] \n\t" \
802 "punpcklbh %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \
803 "punpckhbh %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \
804 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
805 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
806 "paddh %[ftmp5], %[ftmp7], %[ftmp8] \n\t" \
807 "punpcklbh %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
808 "punpckhbh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \
809 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" \
810 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" \
811 "paddh %[ftmp6], %[ftmp7], %[ftmp8] \n\t" \
812 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
813 %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10]) \
814 "paddh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
815 "paddh %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
816 "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
817 "gssdlc1 %[ftmp3], 0x07(%[tmp]) \n\t" \
818 "gssdrc1 %[ftmp3], 0x00(%[tmp]) \n\t" \
820 "daddi %[x], %[x], -0x01 \n\t" \
821 PTR_ADDIU "%[src], %[src], 0x04 \n\t" \
822 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
823 "bnez %[x], 2b \n\t" \
825 "daddi %[y], %[y], -0x01 \n\t" \
826 "li %[x], " #x_step " \n\t" \
827 PTR_ADDIU "%[src], %[src], " #src_step " \n\t" \
828 PTR_ADDIU "%[tmp], %[tmp], " #tmp_step " \n\t" \
829 PTR_ADDU "%[src], %[src], %[stride] \n\t" \
830 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
831 "bnez %[y], 1b \n\t" \
832 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
833 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
834 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
835 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
836 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
837 [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]), \
838 [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y), \
840 : [filter]"r"(filter), [stride]"r"(srcstride) \
845 filter = ff_hevc_qpel_filters[my - 1]; \
849 MMI_LDC1(%[ftmp1], %[filter], 0x00) \
850 "li %[rtmp0], 0x08 \n\t" \
851 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
852 "punpckhbh %[ftmp2], %[ftmp0], %[ftmp1] \n\t" \
853 "punpcklbh %[ftmp1], %[ftmp0], %[ftmp1] \n\t" \
854 "psrah %[ftmp1], %[ftmp1], %[ftmp0] \n\t" \
855 "psrah %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \
856 "li %[rtmp0], 0x06 \n\t" \
857 "dmtc1 %[rtmp0], %[ftmp0] \n\t" \
858 "punpcklhw %[offset], %[offset], %[offset] \n\t" \
859 "punpcklwd %[offset], %[offset], %[offset] \n\t" \
862 "li %[x], " #x_step " \n\t" \
864 "gsldlc1 %[ftmp3], 0x07(%[tmp]) \n\t" \
865 "gsldrc1 %[ftmp3], 0x00(%[tmp]) \n\t" \
866 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
867 "gsldlc1 %[ftmp4], 0x07(%[tmp]) \n\t" \
868 "gsldrc1 %[ftmp4], 0x00(%[tmp]) \n\t" \
869 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
870 "gsldlc1 %[ftmp5], 0x07(%[tmp]) \n\t" \
871 "gsldrc1 %[ftmp5], 0x00(%[tmp]) \n\t" \
872 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
873 "gsldlc1 %[ftmp6], 0x07(%[tmp]) \n\t" \
874 "gsldrc1 %[ftmp6], 0x00(%[tmp]) \n\t" \
875 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
876 "gsldlc1 %[ftmp7], 0x07(%[tmp]) \n\t" \
877 "gsldrc1 %[ftmp7], 0x00(%[tmp]) \n\t" \
878 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
879 "gsldlc1 %[ftmp8], 0x07(%[tmp]) \n\t" \
880 "gsldrc1 %[ftmp8], 0x00(%[tmp]) \n\t" \
881 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
882 "gsldlc1 %[ftmp9], 0x07(%[tmp]) \n\t" \
883 "gsldrc1 %[ftmp9], 0x00(%[tmp]) \n\t" \
884 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
885 "gsldlc1 %[ftmp10], 0x07(%[tmp]) \n\t" \
886 "gsldrc1 %[ftmp10], 0x00(%[tmp]) \n\t" \
887 PTR_ADDIU "%[tmp], %[tmp], -0x380 \n\t" \
888 TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6], \
889 %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \
890 TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10], \
891 %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14]) \
892 "pmaddhw %[ftmp11], %[ftmp3], %[ftmp1] \n\t" \
893 "pmaddhw %[ftmp12], %[ftmp7], %[ftmp2] \n\t" \
894 "pmaddhw %[ftmp13], %[ftmp4], %[ftmp1] \n\t" \
895 "pmaddhw %[ftmp14], %[ftmp8], %[ftmp2] \n\t" \
896 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \
897 "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \
898 TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4]) \
899 "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" \
900 "psraw %[ftmp3], %[ftmp3], %[ftmp0] \n\t" \
901 "pmaddhw %[ftmp11], %[ftmp5], %[ftmp1] \n\t" \
902 "pmaddhw %[ftmp12], %[ftmp9], %[ftmp2] \n\t" \
903 "pmaddhw %[ftmp13], %[ftmp6], %[ftmp1] \n\t" \
904 "pmaddhw %[ftmp14], %[ftmp10], %[ftmp2] \n\t" \
905 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" \
906 "paddw %[ftmp13], %[ftmp13], %[ftmp14] \n\t" \
907 TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6]) \
908 "paddw %[ftmp5], %[ftmp5], %[ftmp6] \n\t" \
909 "psraw %[ftmp5], %[ftmp5], %[ftmp0] \n\t" \
910 "packsswh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \
911 "paddh %[ftmp3], %[ftmp3], %[offset] \n\t" \
912 "psrah %[ftmp3], %[ftmp3], %[shift] \n\t" \
913 "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \
914 "pcmpgth %[ftmp7], %[ftmp3], %[ftmp7] \n\t" \
915 "and %[ftmp3], %[ftmp3], %[ftmp7] \n\t" \
916 "packushb %[ftmp3], %[ftmp3], %[ftmp3] \n\t" \
917 "gsswlc1 %[ftmp3], 0x03(%[dst]) \n\t" \
918 "gsswrc1 %[ftmp3], 0x00(%[dst]) \n\t" \
920 "daddi %[x], %[x], -0x01 \n\t" \
921 PTR_ADDIU "%[tmp], %[tmp], 0x08 \n\t" \
922 PTR_ADDIU "%[dst], %[dst], 0x04 \n\t" \
923 "bnez %[x], 2b \n\t" \
925 "daddi %[y], %[y], -0x01 \n\t" \
926 PTR_ADDIU "%[tmp], %[tmp], " #tmp_step " \n\t" \
927 PTR_ADDIU "%[dst], %[dst], " #dst_step " \n\t" \
928 PTR_ADDU "%[dst], %[dst], %[stride] \n\t" \
929 PTR_ADDIU "%[tmp], %[tmp], 0x80 \n\t" \
930 "bnez %[y], 1b \n\t" \
931 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), \
932 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), \
933 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), \
934 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), \
935 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), \
936 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), \
937 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), \
938 [ftmp14]"=&f"(ftmp[14]), \
939 [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x), \
940 [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0]) \
941 : [filter]"r"(filter), [stride]"r"(dststride), \
/*
 * Instantiate PUT_HEVC_QPEL_UNI_HV once per supported size.
 *
 * NOTE(review): the macro's #define line is outside this chunk, so the
 * argument-to-parameter mapping below is inferred from the argument values
 * and from how the macro body uses x_step, src_step, dst_step and tmp_step.
 * Confirm it against the definition before relying on it.
 *
 *   arg 1    presumably the block width in pixels (4 .. 64).
 *   arg 2    x_step: the inner-loop trip count loaded into x. Each inner
 *            iteration stores 4 output bytes, so the value is arg1 / 4.
 *   arg 3/4  src_step and dst_step (identical values here, so their relative
 *            order cannot be told apart from this view). Each is added to
 *            its pointer at the end of a row to undo the 4-bytes-per-
 *            iteration advance of the inner loop: -4 * x_step.
 *   arg 5    tmp_step: added to tmp at the end of a row to undo the
 *            8-bytes-per-iteration advance of the inner loop: -8 * x_step.
 *
 * After the rewind, the macro body separately adds the row pitch (the stride
 * register for src/dst, the constant 0x80 for tmp) to reach the next row.
 */
947 PUT_HEVC_QPEL_UNI_HV(4, 1, -4, -4, -8);
948 PUT_HEVC_QPEL_UNI_HV(8, 2, -8, -8, -16);
949 PUT_HEVC_QPEL_UNI_HV(12, 3, -12, -12, -24);
950 PUT_HEVC_QPEL_UNI_HV(16, 4, -16, -16, -32);
951 PUT_HEVC_QPEL_UNI_HV(24, 6, -24, -24, -48);
952 PUT_HEVC_QPEL_UNI_HV(32, 8, -32, -32, -64);
953 PUT_HEVC_QPEL_UNI_HV(48, 12, -48, -48, -96);
954 PUT_HEVC_QPEL_UNI_HV(64, 16, -64, -64, -128);