git.sesse.net Git - ffmpeg/blob - libavcodec/mips/hevcdsp_mmi.c
avcodec/mips: [loongson] optimize put_hevc_epel_bi_hv_8 with mmi.
[ffmpeg] / libavcodec / mips / hevcdsp_mmi.c
1 /*
2  * Copyright (c) 2019 Shiyou Yin (yinshiyou-hf@loongson.cn)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "libavcodec/hevcdec.h"
22 #include "libavcodec/bit_depth_template.c"
23 #include "libavcodec/mips/hevcdsp_mips.h"
24 #include "libavutil/mips/mmiutils.h"
25
/*
 * PUT_HEVC_QPEL_HV(w, x_step, src_step, dst_step)
 *
 * Defines ff_hevc_put_hevc_qpel_hv<w>_8_mmi(), which filters a block of
 * 8-bit pixels in two passes of Loongson MMI inline assembly and writes
 * int16_t results to dst:
 *
 *   pass 1: for height + QPEL_EXTRA source rows, apply the 8 taps of
 *           ff_hevc_qpel_filters[mx - 1] along each row and store the
 *           16-bit sums in tmp_array (consecutive rows 0x80 bytes apart);
 *   pass 2: for height rows, apply the 8 taps of
 *           ff_hevc_qpel_filters[my - 1] down each column of tmp_array,
 *           shift the 32-bit sum right by 6, pack it to int16_t with
 *           packsswh and store it in dst (consecutive rows 0x80 bytes
 *           apart).
 *
 * Macro parameters:
 *   w        - suffix used in the generated function name.
 *   x_step   - inner-loop iterations per row; every iteration produces
 *              4 outputs (8 bytes of int16_t).
 *   src_step - byte offset added to src at the end of each pass-1 row.
 *              The instantiations in this file pass -4 * x_step, which
 *              undoes the inner loop's src advance before stride is added.
 *   dst_step - byte offset added to tmp (pass 1) and to tmp and dst
 *              (pass 2) at the end of each row. The instantiations in
 *              this file pass -8 * x_step.
 *
 * The `width` argument is not read by the generated function, and the
 * [stride] operand of the second asm statement is not referenced by its
 * template.
 *
 * MMI_LDC1, TRANSPOSE_4H and TRANSPOSE_2W are defined outside this chunk
 * (presumably in libavutil/mips/mmiutils.h); the descriptions here assume
 * they are a 64-bit load, a 4x4 halfword transpose and a 2x2 word
 * transpose respectively - TODO confirm.
 */
26 #define PUT_HEVC_QPEL_HV(w, x_step, src_step, dst_step)                  \
27 void ff_hevc_put_hevc_qpel_hv##w##_8_mmi(int16_t *dst, uint8_t *_src,    \
28                                      ptrdiff_t _srcstride,               \
29                                      int height, intptr_t mx,            \
30                                      intptr_t my, int width)             \
31 {                                                                        \
32     int x, y;                                                            \
33     const int8_t *filter;                                                \
34     pixel *src = (pixel*)_src;                                           \
35     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                    \
36     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];         \
37     int16_t *tmp = tmp_array;                                            \
38     uint64_t ftmp[15];                                                   \
39     uint64_t rtmp[1];                                                    \
40     /* Step src back QPEL_EXTRA_BEFORE rows and 3 pixels. */             \
41     src   -= (QPEL_EXTRA_BEFORE * srcstride + 3);                        \
42     filter = ff_hevc_qpel_filters[mx - 1];                               \
43     x = x_step;                                                          \
44     y = height + QPEL_EXTRA;                                             \
45     __asm__ volatile(                                                    \
46         MMI_LDC1(%[ftmp1], %[filter], 0x00)                              \
47         "li           %[rtmp0],      0x08                       \n\t"    \
48         "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"    \
49         "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"    \
50         "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"    \
51         "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"    \
52         "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"    \
53         "xor          %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"    \
54         /* Taps are now 16-bit in ftmp1/ftmp2 and ftmp0 is zero. */      \
55         "1:                                                     \n\t"    \
56         "2:                                                     \n\t"    \
57         "gsldlc1      %[ftmp3],      0x07(%[src])               \n\t"    \
58         "gsldrc1      %[ftmp3],      0x00(%[src])               \n\t"    \
59         "gsldlc1      %[ftmp4],      0x08(%[src])               \n\t"    \
60         "gsldrc1      %[ftmp4],      0x01(%[src])               \n\t"    \
61         "gsldlc1      %[ftmp5],      0x09(%[src])               \n\t"    \
62         "gsldrc1      %[ftmp5],      0x02(%[src])               \n\t"    \
63         "gsldlc1      %[ftmp6],      0x0a(%[src])               \n\t"    \
64         "gsldrc1      %[ftmp6],      0x03(%[src])               \n\t"    \
65         "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"    \
66         "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"    \
67         "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
68         "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
69         "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"    \
70         "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"    \
71         "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"    \
72         "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
73         "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
74         "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"    \
75         "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"    \
76         "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"    \
77         "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
78         "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
79         "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"    \
80         "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"    \
81         "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"    \
82         "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"    \
83         "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"    \
84         "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"    \
85         TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],             \
86                      %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])            \
87         "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"    \
88         "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"    \
89         "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"    \
90         "gssdlc1      %[ftmp3],      0x07(%[tmp])               \n\t"    \
91         "gssdrc1      %[ftmp3],      0x00(%[tmp])               \n\t"    \
92         /* Advance to the next group of 4 outputs in this row. */        \
93         "daddi        %[x],          %[x],         -0x01        \n\t"    \
94         PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"    \
95         PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"    \
96         "bnez         %[x],          2b                         \n\t"    \
97         /* Row done: apply *_step, then move down one row. */            \
98         "daddi        %[y],          %[y],         -0x01        \n\t"    \
99         "li           %[x],        " #x_step "                  \n\t"    \
100         PTR_ADDIU    "%[src],        %[src],     " #src_step "  \n\t"    \
101         PTR_ADDIU    "%[tmp],        %[tmp],     " #dst_step "  \n\t"    \
102         PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"    \
103         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
104         "bnez         %[y],          1b                         \n\t"    \
105         : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                  \
106           [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                  \
107           [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                  \
108           [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                  \
109           [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                  \
110           [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]),                \
111           [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                 \
112           [x]"+&r"(x)                                                    \
113         : [filter]"r"(filter), [stride]"r"(srcstride)                    \
114         : "memory"                                                       \
115     );                                                                   \
116     /* Pass 2 reads the intermediate rows back out of tmp_array. */      \
117     tmp    = tmp_array + QPEL_EXTRA_BEFORE * 4 -12;                      \
118     filter = ff_hevc_qpel_filters[my - 1];                               \
119     x = x_step;                                                          \
120     y = height;                                                          \
121     __asm__ volatile(                                                    \
122         MMI_LDC1(%[ftmp1], %[filter], 0x00)                              \
123         "li           %[rtmp0],      0x08                       \n\t"    \
124         "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"    \
125         "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"    \
126         "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"    \
127         "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"    \
128         "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"    \
129         "li           %[rtmp0],      0x06                       \n\t"    \
130         "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"    \
131         /* ftmp0 = 6: the right-shift count applied by psraw. */         \
132         "1:                                                     \n\t"    \
133         "2:                                                     \n\t"    \
134         "gsldlc1      %[ftmp3],      0x07(%[tmp])               \n\t"    \
135         "gsldrc1      %[ftmp3],      0x00(%[tmp])               \n\t"    \
136         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
137         "gsldlc1      %[ftmp4],      0x07(%[tmp])               \n\t"    \
138         "gsldrc1      %[ftmp4],      0x00(%[tmp])               \n\t"    \
139         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
140         "gsldlc1      %[ftmp5],      0x07(%[tmp])               \n\t"    \
141         "gsldrc1      %[ftmp5],      0x00(%[tmp])               \n\t"    \
142         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
143         "gsldlc1      %[ftmp6],      0x07(%[tmp])               \n\t"    \
144         "gsldrc1      %[ftmp6],      0x00(%[tmp])               \n\t"    \
145         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
146         "gsldlc1      %[ftmp7],      0x07(%[tmp])               \n\t"    \
147         "gsldrc1      %[ftmp7],      0x00(%[tmp])               \n\t"    \
148         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
149         "gsldlc1      %[ftmp8],      0x07(%[tmp])               \n\t"    \
150         "gsldrc1      %[ftmp8],      0x00(%[tmp])               \n\t"    \
151         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
152         "gsldlc1      %[ftmp9],      0x07(%[tmp])               \n\t"    \
153         "gsldrc1      %[ftmp9],      0x00(%[tmp])               \n\t"    \
154         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
155         "gsldlc1      %[ftmp10],     0x07(%[tmp])               \n\t"    \
156         "gsldrc1      %[ftmp10],     0x00(%[tmp])               \n\t"    \
157         PTR_ADDIU    "%[tmp],        %[tmp],        -0x380      \n\t"    \
158         TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],             \
159                      %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])         \
160         TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],            \
161                      %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])         \
162         "pmaddhw      %[ftmp11],     %[ftmp3],      %[ftmp1]    \n\t"    \
163         "pmaddhw      %[ftmp12],     %[ftmp7],      %[ftmp2]    \n\t"    \
164         "pmaddhw      %[ftmp13],     %[ftmp4],      %[ftmp1]    \n\t"    \
165         "pmaddhw      %[ftmp14],     %[ftmp8],      %[ftmp2]    \n\t"    \
166         "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"    \
167         "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"    \
168         TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4])           \
169         "paddw        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"    \
170         "psraw        %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"    \
171         "pmaddhw      %[ftmp11],     %[ftmp5],      %[ftmp1]    \n\t"    \
172         "pmaddhw      %[ftmp12],     %[ftmp9],      %[ftmp2]    \n\t"    \
173         "pmaddhw      %[ftmp13],     %[ftmp6],      %[ftmp1]    \n\t"    \
174         "pmaddhw      %[ftmp14],     %[ftmp10],     %[ftmp2]    \n\t"    \
175         "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"    \
176         "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"    \
177         TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6])           \
178         "paddw        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"    \
179         "psraw        %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"    \
180         "packsswh     %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"    \
181         "gssdlc1      %[ftmp3],      0x07(%[dst])               \n\t"    \
182         "gssdrc1      %[ftmp3],      0x00(%[dst])               \n\t"    \
183         /* Advance to the next group of 4 columns. */                    \
184         "daddi        %[x],          %[x],         -0x01        \n\t"    \
185         PTR_ADDIU    "%[dst],        %[dst],        0x08        \n\t"    \
186         PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"    \
187         "bnez         %[x],          2b                         \n\t"    \
188         /* Row done: apply dst_step, then add 0x80 to dst and tmp. */    \
189         "daddi        %[y],          %[y],         -0x01        \n\t"    \
190         "li           %[x],        " #x_step "                  \n\t"    \
191         PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"    \
192         PTR_ADDIU    "%[tmp],        %[tmp],     " #dst_step "  \n\t"    \
193         PTR_ADDIU    "%[dst],        %[dst],        0x80        \n\t"    \
194         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"    \
195         "bnez         %[y],          1b                         \n\t"    \
196         : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                  \
197           [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                  \
198           [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                  \
199           [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                  \
200           [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                  \
201           [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),              \
202           [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),              \
203           [ftmp14]"=&f"(ftmp[14]), [rtmp0]"=&r"(rtmp[0]),                \
204           [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y),                 \
205           [x]"+&r"(x)                                                    \
206         : [filter]"r"(filter), [stride]"r"(srcstride)                    \
207         : "memory"                                                       \
208     );                                                                   \
209 }
210
/*
 * Instantiate ff_hevc_put_hevc_qpel_hv<w>_8_mmi() for each listed w.
 * In every instance x_step = w / 4, src_step = -w and dst_step = -2 * w.
 */
211 PUT_HEVC_QPEL_HV(4, 1, -4, -8);
212 PUT_HEVC_QPEL_HV(8, 2, -8, -16);
213 PUT_HEVC_QPEL_HV(12, 3, -12, -24);
214 PUT_HEVC_QPEL_HV(16, 4, -16, -32);
215 PUT_HEVC_QPEL_HV(24, 6, -24, -48);
216 PUT_HEVC_QPEL_HV(32, 8, -32, -64);
217 PUT_HEVC_QPEL_HV(48, 12, -48, -96);
218 PUT_HEVC_QPEL_HV(64, 16, -64, -128);
219
/*
 * PUT_HEVC_QPEL_BI_HV(w, x_step, src_step, src2_step, dst_step)
 *
 * Defines ff_hevc_put_hevc_qpel_bi_hv<w>_8_mmi(), which filters a block of
 * 8-bit pixels in two passes of Loongson MMI inline assembly, combines the
 * result with the int16_t samples in src2 and writes 8-bit pixels to dst:
 *
 *   pass 1: for height + QPEL_EXTRA source rows, apply the 8 taps of
 *           ff_hevc_qpel_filters[mx - 1] along each row and store the
 *           16-bit sums in tmp_array (consecutive rows 0x80 bytes apart);
 *   pass 2: for height rows, apply the 8 taps of
 *           ff_hevc_qpel_filters[my - 1] down each column of tmp_array,
 *           shift the 32-bit sum right by 6 and pack it to 16 bits; then
 *           sign-extend it and the matching src2 sample to 32 bits, add
 *           them, add offset (64), shift right by shift (7), zero any
 *           negative result, saturate to unsigned 8 bits and store 4 bytes
 *           to dst.
 *
 * Macro parameters:
 *   w         - suffix used in the generated function name.
 *   x_step    - inner-loop iterations per row; every iteration produces
 *               4 outputs. Pass 1 uses width >> 2 for its first row and
 *               x_step for the following rows.
 *   src_step  - byte offset added to src at the end of each pass-1 row.
 *   src2_step - byte offset added to tmp at the end of each pass-1 row and
 *               to src2 and tmp at the end of each pass-2 row.
 *   dst_step  - byte offset added to dst at the end of each pass-2 row,
 *               before dststride is added.
 *
 * src2 rows are stepped 0x80 bytes apart, the same pitch as tmp_array.
 *
 * MMI_LDC1, TRANSPOSE_4H and TRANSPOSE_2W are defined outside this chunk
 * (presumably in libavutil/mips/mmiutils.h); the descriptions here assume
 * they are a 64-bit load, a 4x4 halfword transpose and a 2x2 word
 * transpose respectively - TODO confirm.
 *
 * NOTE(review): shift and offset are declared as int but are bound to
 * floating-point registers through "f" constraints - confirm that every
 * supported compiler accepts this and fills the register as expected.
 */
220 #define PUT_HEVC_QPEL_BI_HV(w, x_step, src_step, src2_step, dst_step)   \
221 void ff_hevc_put_hevc_qpel_bi_hv##w##_8_mmi(uint8_t *_dst,              \
222                                             ptrdiff_t _dststride,       \
223                                             uint8_t *_src,              \
224                                             ptrdiff_t _srcstride,       \
225                                             int16_t *src2, int height,  \
226                                             intptr_t mx, intptr_t my,   \
227                                             int width)                  \
228 {                                                                       \
229     int x, y;                                                           \
230     const int8_t *filter;                                               \
231     pixel *src = (pixel*)_src;                                          \
232     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                   \
233     pixel *dst          = (pixel *)_dst;                                \
234     ptrdiff_t dststride = _dststride / sizeof(pixel);                   \
235     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];        \
236     int16_t *tmp = tmp_array;                                           \
237     uint64_t ftmp[20];                                                  \
238     uint64_t rtmp[1];                                                   \
239     int shift = 7;                                                      \
240     int offset = 64;                                                    \
241     /* Step src back QPEL_EXTRA_BEFORE rows and 3 pixels. */            \
242     src   -= (QPEL_EXTRA_BEFORE * srcstride + 3);                       \
243     filter = ff_hevc_qpel_filters[mx - 1];                              \
244     x = width >> 2;                                                     \
245     y = height + QPEL_EXTRA;                                            \
246     __asm__ volatile(                                                   \
247         MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
248         "li           %[rtmp0],      0x08                       \n\t"   \
249         "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
250         "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
251         "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
252         "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
253         "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
254         "xor          %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"   \
255         /* Taps are now 16-bit in ftmp1/ftmp2 and ftmp0 is zero. */     \
256         "1:                                                     \n\t"   \
257         "2:                                                     \n\t"   \
258         "gsldlc1      %[ftmp3],      0x07(%[src])               \n\t"   \
259         "gsldrc1      %[ftmp3],      0x00(%[src])               \n\t"   \
260         "gsldlc1      %[ftmp4],      0x08(%[src])               \n\t"   \
261         "gsldrc1      %[ftmp4],      0x01(%[src])               \n\t"   \
262         "gsldlc1      %[ftmp5],      0x09(%[src])               \n\t"   \
263         "gsldrc1      %[ftmp5],      0x02(%[src])               \n\t"   \
264         "gsldlc1      %[ftmp6],      0x0a(%[src])               \n\t"   \
265         "gsldrc1      %[ftmp6],      0x03(%[src])               \n\t"   \
266         "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"   \
267         "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"   \
268         "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
269         "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
270         "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"   \
271         "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"   \
272         "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"   \
273         "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
274         "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
275         "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"   \
276         "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"   \
277         "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"   \
278         "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
279         "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
280         "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"   \
281         "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"   \
282         "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"   \
283         "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
284         "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
285         "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"   \
286         TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
287                      %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])           \
288         "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
289         "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
290         "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
291         "gssdlc1      %[ftmp3],      0x07(%[tmp])               \n\t"   \
292         "gssdrc1      %[ftmp3],      0x00(%[tmp])               \n\t"   \
293         /* Advance to the next group of 4 outputs in this row. */       \
294         "daddi        %[x],          %[x],         -0x01        \n\t"   \
295         PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"   \
296         PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
297         "bnez         %[x],          2b                         \n\t"   \
298         /* Row done: apply *_step, then move down one row. */           \
299         "daddi        %[y],          %[y],         -0x01        \n\t"   \
300         "li           %[x],        " #x_step "                  \n\t"   \
301         PTR_ADDIU    "%[src],        %[src],      " #src_step " \n\t"   \
302         PTR_ADDIU    "%[tmp],        %[tmp],     " #src2_step " \n\t"   \
303         PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"   \
304         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
305         "bnez         %[y],          1b                         \n\t"   \
306         : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
307           [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
308           [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
309           [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
310           [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
311           [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]),               \
312           [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                \
313           [x]"+&r"(x)                                                   \
314         : [filter]"r"(filter), [stride]"r"(srcstride)                   \
315         : "memory"                                                      \
316     );                                                                  \
317     /* Pass 2; x is an output-only operand there, set by "li". */       \
318     tmp    = tmp_array;                                                 \
319     filter = ff_hevc_qpel_filters[my - 1];                              \
320     x = width >> 2;                                                     \
321     y = height;                                                         \
322     __asm__ volatile(                                                   \
323         MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
324         "li           %[rtmp0],      0x08                       \n\t"   \
325         "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
326         "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
327         "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
328         "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
329         "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
330         "li           %[rtmp0],      0x06                       \n\t"   \
331         "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
332         "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"   \
333         /* ftmp0 = 6; offset's low word is now in both words. */        \
334         "1:                                                     \n\t"   \
335         "li           %[x],        " #x_step "                  \n\t"   \
336         "2:                                                     \n\t"   \
337         "gsldlc1      %[ftmp3],      0x07(%[tmp])               \n\t"   \
338         "gsldrc1      %[ftmp3],      0x00(%[tmp])               \n\t"   \
339         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
340         "gsldlc1      %[ftmp4],      0x07(%[tmp])               \n\t"   \
341         "gsldrc1      %[ftmp4],      0x00(%[tmp])               \n\t"   \
342         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
343         "gsldlc1      %[ftmp5],      0x07(%[tmp])               \n\t"   \
344         "gsldrc1      %[ftmp5],      0x00(%[tmp])               \n\t"   \
345         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
346         "gsldlc1      %[ftmp6],      0x07(%[tmp])               \n\t"   \
347         "gsldrc1      %[ftmp6],      0x00(%[tmp])               \n\t"   \
348         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
349         "gsldlc1      %[ftmp7],      0x07(%[tmp])               \n\t"   \
350         "gsldrc1      %[ftmp7],      0x00(%[tmp])               \n\t"   \
351         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
352         "gsldlc1      %[ftmp8],      0x07(%[tmp])               \n\t"   \
353         "gsldrc1      %[ftmp8],      0x00(%[tmp])               \n\t"   \
354         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
355         "gsldlc1      %[ftmp9],      0x07(%[tmp])               \n\t"   \
356         "gsldrc1      %[ftmp9],      0x00(%[tmp])               \n\t"   \
357         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
358         "gsldlc1      %[ftmp10],     0x07(%[tmp])               \n\t"   \
359         "gsldrc1      %[ftmp10],     0x00(%[tmp])               \n\t"   \
360         PTR_ADDIU    "%[tmp],        %[tmp],        -0x380      \n\t"   \
361         TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
362                      %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])        \
363         TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],           \
364                      %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])        \
365         "pmaddhw      %[ftmp11],     %[ftmp3],      %[ftmp1]    \n\t"   \
366         "pmaddhw      %[ftmp12],     %[ftmp7],      %[ftmp2]    \n\t"   \
367         "pmaddhw      %[ftmp13],     %[ftmp4],      %[ftmp1]    \n\t"   \
368         "pmaddhw      %[ftmp14],     %[ftmp8],      %[ftmp2]    \n\t"   \
369         "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"   \
370         "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"   \
371         TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4])          \
372         "paddw        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
373         "psraw        %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"   \
374         "pmaddhw      %[ftmp11],     %[ftmp5],      %[ftmp1]    \n\t"   \
375         "pmaddhw      %[ftmp12],     %[ftmp9],      %[ftmp2]    \n\t"   \
376         "pmaddhw      %[ftmp13],     %[ftmp6],      %[ftmp1]    \n\t"   \
377         "pmaddhw      %[ftmp14],     %[ftmp10],     %[ftmp2]    \n\t"   \
378         "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"   \
379         "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"   \
380         TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6])          \
381         "paddw        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
382         "psraw        %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"   \
383         "packsswh     %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
384         "gsldlc1      %[ftmp4],      0x07(%[src2])              \n\t"   \
385         "gsldrc1      %[ftmp4],      0x00(%[src2])              \n\t"   \
386         "xor          %[ftmp7],      %[ftmp7],      %[ftmp7]    \n\t"   \
387         "li           %[rtmp0],      0x10                       \n\t"   \
388         "dmtc1        %[rtmp0],      %[ftmp8]                   \n\t"   \
389         "punpcklhw    %[ftmp5],      %[ftmp7],      %[ftmp3]    \n\t"   \
390         "punpckhhw    %[ftmp6],      %[ftmp7],      %[ftmp3]    \n\t"   \
391         "punpckhhw    %[ftmp3],      %[ftmp7],      %[ftmp4]    \n\t"   \
392         "punpcklhw    %[ftmp4],      %[ftmp7],      %[ftmp4]    \n\t"   \
393         "psraw        %[ftmp5],      %[ftmp5],      %[ftmp8]    \n\t"   \
394         "psraw        %[ftmp6],      %[ftmp6],      %[ftmp8]    \n\t"   \
395         "psraw        %[ftmp3],      %[ftmp3],      %[ftmp8]    \n\t"   \
396         "psraw        %[ftmp4],      %[ftmp4],      %[ftmp8]    \n\t"   \
397         "paddw        %[ftmp5],      %[ftmp5],      %[ftmp4]    \n\t"   \
398         "paddw        %[ftmp6],      %[ftmp6],      %[ftmp3]    \n\t"   \
399         "paddw        %[ftmp5],      %[ftmp5],      %[offset]   \n\t"   \
400         "paddw        %[ftmp6],      %[ftmp6],      %[offset]   \n\t"   \
401         "psraw        %[ftmp5],      %[ftmp5],      %[shift]    \n\t"   \
402         "psraw        %[ftmp6],      %[ftmp6],      %[shift]    \n\t"   \
403         "packsswh     %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
404         "pcmpgth      %[ftmp7],      %[ftmp5],      %[ftmp7]    \n\t"   \
405         "and          %[ftmp3],      %[ftmp5],      %[ftmp7]    \n\t"   \
406         "packushb     %[ftmp3],      %[ftmp3],      %[ftmp3]    \n\t"   \
407         "gsswlc1      %[ftmp3],      0x03(%[dst])               \n\t"   \
408         "gsswrc1      %[ftmp3],      0x00(%[dst])               \n\t"   \
409         /* Advance to the next group of 4 columns. */                   \
410         "daddi        %[x],          %[x],         -0x01        \n\t"   \
411         PTR_ADDIU    "%[src2],       %[src2],       0x08        \n\t"   \
412         PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
413         PTR_ADDIU    "%[dst],        %[dst],        0x04        \n\t"   \
414         "bnez         %[x],          2b                         \n\t"   \
415         /* Row done: apply *_step, then move each pointer a row. */     \
416         "daddi        %[y],          %[y],         -0x01        \n\t"   \
417         PTR_ADDIU    "%[src2],       %[src2],    " #src2_step " \n\t"   \
418         PTR_ADDIU    "%[tmp],        %[tmp],     " #src2_step " \n\t"   \
419         PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"   \
420         PTR_ADDIU    "%[src2],       %[src2],       0x80        \n\t"   \
421         PTR_ADDU     "%[dst],        %[dst],        %[stride]   \n\t"   \
422         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
423         "bnez         %[y],          1b                         \n\t"   \
424         : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
425           [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
426           [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
427           [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
428           [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
429           [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),             \
430           [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),             \
431           [ftmp14]"=&f"(ftmp[14]), [src2]"+&r"(src2),                   \
432           [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x),   \
433           [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0])                  \
434         : [filter]"r"(filter), [stride]"r"(dststride),                  \
435           [shift]"f"(shift)                                             \
436         : "memory"                                                      \
437     );                                                                  \
438 }
439
/*
 * Instantiate PUT_HEVC_QPEL_BI_HV once for each first-argument value
 * 4, 8, 12, 16, 24, 32, 48 and 64 (presumably the block width in pixels).
 * In every instantiation the second argument is a quarter of the first,
 * the third and fifth arguments are its negation, and the fourth is minus
 * twice the first.
 */
440 PUT_HEVC_QPEL_BI_HV(4, 1, -4, -8, -4);
441 PUT_HEVC_QPEL_BI_HV(8, 2, -8, -16, -8);
442 PUT_HEVC_QPEL_BI_HV(12, 3, -12, -24, -12);
443 PUT_HEVC_QPEL_BI_HV(16, 4, -16, -32, -16);
444 PUT_HEVC_QPEL_BI_HV(24, 6, -24, -48, -24);
445 PUT_HEVC_QPEL_BI_HV(32, 8, -32, -64, -32);
446 PUT_HEVC_QPEL_BI_HV(48, 12, -48, -96, -48);
447 PUT_HEVC_QPEL_BI_HV(64, 16, -64, -128, -64);
448
/*
 * PUT_HEVC_EPEL_BI_HV(w, x_step, src_step, src2_step, dst_step)
 *
 * Defines ff_hevc_put_hevc_epel_bi_hv<w>_8_mmi(): a two-pass 4-tap filter
 * (horizontal, then vertical) whose result is combined with the 16-bit
 * samples in src2 and written to dst as 8-bit pixels.
 *
 * Macro parameters:
 *   w         - pasted into the generated function name.
 *   x_step    - value reloaded into the inner-loop counter x for each new
 *               row; every inner iteration handles 4 pixels.
 *   src_step  - byte constant added to src at the end of each row of the
 *               first pass, before the source stride is added.
 *   src2_step - byte constant added to tmp (both passes) and to src2
 *               (second pass) at the end of each row, before the fixed
 *               0x80-byte row pitch is added.
 *   dst_step  - byte constant added to dst at the end of each row of the
 *               second pass, before the destination stride is added.
 *
 * Pass 1 (first asm block), run for height + EPEL_EXTRA rows starting at
 * src - (EPEL_EXTRA_BEFORE * srcstride + 1):
 *   - loads the 4 filter bytes of ff_hevc_epel_filters[mx - 1] and
 *     sign-extends them to 16-bit lanes (punpcklbh + psrah by 8);
 *   - per iteration loads four unaligned 32-bit words at src+0..src+3,
 *     zero-extends the bytes to 16 bit, multiplies each by the taps,
 *     transposes the 4x4 halfword matrix and sums the rows, producing four
 *     16-bit results that are stored to tmp;
 *   - advances src by 4 and tmp by 8 bytes per iteration.
 *   The inner-loop counter starts at width >> 2 for the first row and is
 *   reloaded with x_step for every following row.
 *
 * Pass 2 (second asm block), run for height rows starting at tmp_array:
 *   - loads the taps of ff_hevc_epel_filters[my - 1] the same way, keeps
 *     the constant 6 in ftmp0 as a shift count, duplicates the low 32 bits
 *     of offset (64) into both word lanes and zeroes ftmp2;
 *   - per iteration loads four 8-byte vectors from tmp at row offsets
 *     0, 0x80, 0x100 and 0x180, transposes them, applies the taps with
 *     pmaddhw, sums the partial products and shifts right by 6;
 *   - widens that result and four int16_t values from src2 to signed
 *     32 bit, adds them, adds offset and shifts right by shift (7);
 *   - packs with signed saturation, forces non-positive lanes to zero
 *     (pcmpgth against zero + and), packs to unsigned bytes and stores
 *     4 bytes to dst;
 *   - advances src2 and tmp by 8 bytes and dst by 4 bytes per iteration.
 *   x is an output-only operand here and is loaded with x_step at the top
 *   of every row, so the C assignment "x = width >> 2" before this block
 *   is not consumed by the asm.
 *
 * NOTE(review): the 0x80-byte row pitch used for tmp and src2 corresponds
 * to 64 int16_t per row; presumably this equals MAX_PB_SIZE - confirm.
 * NOTE(review): shift and offset are plain ints bound to "f" (FPU
 * register) operands; confirm this is accepted by every supported compiler.
 */
449 #define PUT_HEVC_EPEL_BI_HV(w, x_step, src_step, src2_step, dst_step)   \
450 void ff_hevc_put_hevc_epel_bi_hv##w##_8_mmi(uint8_t *_dst,              \
451                                             ptrdiff_t _dststride,       \
452                                             uint8_t *_src,              \
453                                             ptrdiff_t _srcstride,       \
454                                             int16_t *src2, int height,  \
455                                             intptr_t mx, intptr_t my,   \
456                                             int width)                  \
457 {                                                                       \
458     int x, y;                                                           \
459     pixel *src = (pixel *)_src;                                         \
460     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                   \
461     pixel *dst          = (pixel *)_dst;                                \
462     ptrdiff_t dststride = _dststride / sizeof(pixel);                   \
463     const int8_t *filter = ff_hevc_epel_filters[mx - 1];                \
464     int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];        \
465     int16_t *tmp = tmp_array;                                           \
466     uint64_t ftmp[12];                                                  \
467     uint64_t rtmp[1];                                                   \
468     int shift = 7;                                                      \
469     int offset = 64;                                                    \
470                                                                         \
471     src -= (EPEL_EXTRA_BEFORE * srcstride + 1);                         \
472     x = width >> 2;                                                     \
473     y = height + EPEL_EXTRA;                                            \
474     __asm__ volatile(                                                   \
475         MMI_LWC1(%[ftmp1], %[filter], 0x00)                             \
476         "li           %[rtmp0],      0x08                       \n\t"   \
477         "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
478         "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
479         "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
480         "xor          %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"   \
481                                                                         \
482         "1:                                                     \n\t"   \
483         "2:                                                     \n\t"   \
484         "gslwlc1      %[ftmp2],      0x03(%[src])               \n\t"   \
485         "gslwrc1      %[ftmp2],      0x00(%[src])               \n\t"   \
486         "gslwlc1      %[ftmp3],      0x04(%[src])               \n\t"   \
487         "gslwrc1      %[ftmp3],      0x01(%[src])               \n\t"   \
488         "gslwlc1      %[ftmp4],      0x05(%[src])               \n\t"   \
489         "gslwrc1      %[ftmp4],      0x02(%[src])               \n\t"   \
490         "gslwlc1      %[ftmp5],      0x06(%[src])               \n\t"   \
491         "gslwrc1      %[ftmp5],      0x03(%[src])               \n\t"   \
492         "punpcklbh    %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
493         "pmullh       %[ftmp2],      %[ftmp2],      %[ftmp1]    \n\t"   \
494         "punpcklbh    %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"   \
495         "pmullh       %[ftmp3],      %[ftmp3],      %[ftmp1]    \n\t"   \
496         "punpcklbh    %[ftmp4],      %[ftmp4],      %[ftmp0]    \n\t"   \
497         "pmullh       %[ftmp4],      %[ftmp4],      %[ftmp1]    \n\t"   \
498         "punpcklbh    %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"   \
499         "pmullh       %[ftmp5],      %[ftmp5],      %[ftmp1]    \n\t"   \
500         TRANSPOSE_4H(%[ftmp2], %[ftmp3], %[ftmp4], %[ftmp5],            \
501                      %[ftmp6], %[ftmp7], %[ftmp8], %[ftmp9])            \
502         "paddh        %[ftmp2],      %[ftmp2],      %[ftmp3]    \n\t"   \
503         "paddh        %[ftmp4],      %[ftmp4],      %[ftmp5]    \n\t"   \
504         "paddh        %[ftmp2],      %[ftmp2],      %[ftmp4]    \n\t"   \
505         "gssdlc1      %[ftmp2],      0x07(%[tmp])               \n\t"   \
506         "gssdrc1      %[ftmp2],      0x00(%[tmp])               \n\t"   \
507                                                                         \
508         "daddi        %[x],          %[x],         -0x01        \n\t"   \
509         PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"   \
510         PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
511         "bnez         %[x],          2b                         \n\t"   \
512                                                                         \
513         "daddi        %[y],          %[y],         -0x01        \n\t"   \
514         "li           %[x],        " #x_step "                  \n\t"   \
515         PTR_ADDIU    "%[src],        %[src],      " #src_step " \n\t"   \
516         PTR_ADDIU    "%[tmp],        %[tmp],     " #src2_step " \n\t"   \
517         PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"   \
518         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
519         "bnez         %[y],          1b                         \n\t"   \
520         : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
521           [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
522           [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
523           [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
524           [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
525           [rtmp0]"=&r"(rtmp[0]),                                        \
526           [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                \
527           [x]"+&r"(x)                                                   \
528         : [filter]"r"(filter), [stride]"r"(srcstride)                   \
529         : "memory"                                                      \
530     );                                                                  \
531                                                                         \
532     tmp      = tmp_array;                                               \
533     filter = ff_hevc_epel_filters[my - 1];                              \
534     x = width >> 2;                                                     \
535     y = height;                                                         \
536     __asm__ volatile(                                                   \
537         MMI_LWC1(%[ftmp1], %[filter], 0x00)                             \
538         "li           %[rtmp0],      0x08                       \n\t"   \
539         "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
540         "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
541         "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
542         "li           %[rtmp0],      0x06                       \n\t"   \
543         "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
544         "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"   \
545         "xor          %[ftmp2],      %[ftmp2],      %[ftmp2]    \n\t"   \
546                                                                         \
547         "1:                                                     \n\t"   \
548         "li           %[x],        " #x_step "                  \n\t"   \
549         "2:                                                     \n\t"   \
550         "gsldlc1      %[ftmp3],      0x07(%[tmp])               \n\t"   \
551         "gsldrc1      %[ftmp3],      0x00(%[tmp])               \n\t"   \
552         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
553         "gsldlc1      %[ftmp4],      0x07(%[tmp])               \n\t"   \
554         "gsldrc1      %[ftmp4],      0x00(%[tmp])               \n\t"   \
555         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
556         "gsldlc1      %[ftmp5],      0x07(%[tmp])               \n\t"   \
557         "gsldrc1      %[ftmp5],      0x00(%[tmp])               \n\t"   \
558         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
559         "gsldlc1      %[ftmp6],      0x07(%[tmp])               \n\t"   \
560         "gsldrc1      %[ftmp6],      0x00(%[tmp])               \n\t"   \
561         PTR_ADDIU    "%[tmp],        %[tmp],       -0x180       \n\t"   \
562         TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
563                      %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])           \
564         "pmaddhw      %[ftmp7],      %[ftmp3],      %[ftmp1]    \n\t"   \
565         "pmaddhw      %[ftmp8],      %[ftmp4],      %[ftmp1]    \n\t"   \
566         TRANSPOSE_2W(%[ftmp7], %[ftmp8], %[ftmp3], %[ftmp4])            \
567         "paddw        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
568         "psraw        %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"   \
569         "pmaddhw      %[ftmp7],      %[ftmp5],      %[ftmp1]    \n\t"   \
570         "pmaddhw      %[ftmp8],      %[ftmp6],      %[ftmp1]    \n\t"   \
571         TRANSPOSE_2W(%[ftmp7], %[ftmp8], %[ftmp5], %[ftmp6])            \
572         "paddw        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
573         "psraw        %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"   \
574         "packsswh     %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
575         "gsldlc1      %[ftmp4],      0x07(%[src2])              \n\t"   \
576         "gsldrc1      %[ftmp4],      0x00(%[src2])              \n\t"   \
577         "li           %[rtmp0],      0x10                       \n\t"   \
578         "dmtc1        %[rtmp0],      %[ftmp8]                   \n\t"   \
579         "punpcklhw    %[ftmp5],      %[ftmp2],      %[ftmp3]    \n\t"   \
580         "punpckhhw    %[ftmp6],      %[ftmp2],      %[ftmp3]    \n\t"   \
581         "punpckhhw    %[ftmp3],      %[ftmp2],      %[ftmp4]    \n\t"   \
582         "punpcklhw    %[ftmp4],      %[ftmp2],      %[ftmp4]    \n\t"   \
583         "psraw        %[ftmp5],      %[ftmp5],      %[ftmp8]    \n\t"   \
584         "psraw        %[ftmp6],      %[ftmp6],      %[ftmp8]    \n\t"   \
585         "psraw        %[ftmp3],      %[ftmp3],      %[ftmp8]    \n\t"   \
586         "psraw        %[ftmp4],      %[ftmp4],      %[ftmp8]    \n\t"   \
587         "paddw        %[ftmp5],      %[ftmp5],      %[ftmp4]    \n\t"   \
588         "paddw        %[ftmp6],      %[ftmp6],      %[ftmp3]    \n\t"   \
589         "paddw        %[ftmp5],      %[ftmp5],      %[offset]   \n\t"   \
590         "paddw        %[ftmp6],      %[ftmp6],      %[offset]   \n\t"   \
591         "psraw        %[ftmp5],      %[ftmp5],      %[shift]    \n\t"   \
592         "psraw        %[ftmp6],      %[ftmp6],      %[shift]    \n\t"   \
593         "packsswh     %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
594         "pcmpgth      %[ftmp7],      %[ftmp5],      %[ftmp2]    \n\t"   \
595         "and          %[ftmp3],      %[ftmp5],      %[ftmp7]    \n\t"   \
596         "packushb     %[ftmp3],      %[ftmp3],      %[ftmp3]    \n\t"   \
597         "gsswlc1      %[ftmp3],      0x03(%[dst])               \n\t"   \
598         "gsswrc1      %[ftmp3],      0x00(%[dst])               \n\t"   \
599                                                                         \
600         "daddi        %[x],          %[x],         -0x01        \n\t"   \
601         PTR_ADDIU    "%[src2],       %[src2],       0x08        \n\t"   \
602         PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
603         PTR_ADDIU    "%[dst],        %[dst],        0x04        \n\t"   \
604         "bnez         %[x],          2b                         \n\t"   \
605                                                                         \
606         "daddi        %[y],          %[y],         -0x01        \n\t"   \
607         PTR_ADDIU    "%[src2],       %[src2],    " #src2_step " \n\t"   \
608         PTR_ADDIU    "%[tmp],        %[tmp],     " #src2_step " \n\t"   \
609         PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"   \
610         PTR_ADDIU    "%[src2],       %[src2],       0x80        \n\t"   \
611         PTR_ADDU     "%[dst],        %[dst],        %[stride]   \n\t"   \
612         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
613         "bnez         %[y],          1b                         \n\t"   \
614         : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
615           [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
616           [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
617           [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
618           [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
619           [ftmp10]"=&f"(ftmp[10]), [src2]"+&r"(src2),                   \
620           [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x),   \
621           [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0])                  \
622         : [filter]"r"(filter), [stride]"r"(dststride),                  \
623           [shift]"f"(shift)                                             \
624         : "memory"                                                      \
625     );                                                                  \
626 }
627
/*
 * Instantiate ff_hevc_put_hevc_epel_bi_hv<w>_8_mmi for w = 4, 8, 12, 16,
 * 24 and 32. Arguments are (w, x_step, src_step, src2_step, dst_step):
 * x_step = w / 4 is the number of 4-pixel inner-loop iterations per row,
 * while src_step = dst_step = -w and src2_step = -2 * w cancel the
 * per-row pointer advance of the inner loop before the row pitch is added.
 */
628 PUT_HEVC_EPEL_BI_HV(4, 1, -4, -8, -4);
629 PUT_HEVC_EPEL_BI_HV(8, 2, -8, -16, -8);
630 PUT_HEVC_EPEL_BI_HV(12, 3, -12, -24, -12);
631 PUT_HEVC_EPEL_BI_HV(16, 4, -16, -32, -16);
632 PUT_HEVC_EPEL_BI_HV(24, 6, -24, -48, -24);
633 PUT_HEVC_EPEL_BI_HV(32, 8, -32, -64, -32);
634
/*
 * PUT_HEVC_PEL_BI_PIXELS(w, x_step, src_step, dst_step, src2_step)
 *
 * Defines ff_hevc_put_hevc_pel_bi_pixels<w>_8_mmi(): combines unfiltered
 * 8-bit source pixels with the 16-bit samples in src2 and writes 8-bit
 * pixels to dst. The mx and my arguments are not referenced by the body.
 *
 * Macro parameters:
 *   w         - pasted into the generated function name.
 *   x_step    - value reloaded into the inner-loop counter x after each
 *               row; every inner iteration handles 8 pixels.
 *   src_step  - byte constant added to src at the end of each row, before
 *               srcstride is added.
 *   dst_step  - byte constant added to dst at the end of each row, before
 *               dststride is added.
 *   src2_step - byte constant added to src2 at the end of each row, before
 *               the fixed 0x80-byte row pitch is added.
 *
 * Setup: ftmp0 = 0, ftmp1 = 6 (left-shift count), ftmp10 = 16 (used to
 * sign-extend halfwords), and the operand named offset (backed by
 * ftmp[11]) holds 0x40 replicated into all four 16-bit lanes.
 *
 * Per inner iteration:
 *   - loads 8 bytes from src and 16 bytes (eight int16_t) from src2;
 *   - zero-extends the source bytes to 16 bit, shifts them left by 6 and
 *     adds 0x40 to each lane, then zero-extends those lanes to 32 bit;
 *   - sign-extends the src2 halfwords to 32 bit (interleave into the upper
 *     half of each word, then arithmetic shift right by 16);
 *   - adds both, shifts right arithmetically by shift (7), packs to 16 bit
 *     with signed saturation, forces non-positive lanes to zero
 *     (pcmpgth against zero + and), packs to unsigned bytes and stores
 *     8 bytes to dst;
 *   - advances src and dst by 8 bytes and src2 by 16 bytes.
 * The inner-loop counter starts at width >> 3 for the first row; the outer
 * loop runs height times.
 *
 * NOTE(review): the 0x80-byte src2 row pitch corresponds to 64 int16_t per
 * row; presumably this equals MAX_PB_SIZE - confirm.
 * NOTE(review): shift is a plain int bound to an "f" (FPU register)
 * operand; confirm this is accepted by every supported compiler.
 */
635 #define PUT_HEVC_PEL_BI_PIXELS(w, x_step, src_step, dst_step, src2_step)  \
636 void ff_hevc_put_hevc_pel_bi_pixels##w##_8_mmi(uint8_t *_dst,             \
637                                                ptrdiff_t _dststride,      \
638                                                uint8_t *_src,             \
639                                                ptrdiff_t _srcstride,      \
640                                                int16_t *src2, int height, \
641                                                intptr_t mx, intptr_t my,  \
642                                                int width)                 \
643 {                                                                         \
644     int x, y;                                                             \
645     pixel *src          = (pixel *)_src;                                  \
646     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                     \
647     pixel *dst          = (pixel *)_dst;                                  \
648     ptrdiff_t dststride = _dststride / sizeof(pixel);                     \
649     uint64_t ftmp[12];                                                    \
650     uint64_t rtmp[1];                                                     \
651     int shift = 7;                                                        \
652                                                                           \
653     y = height;                                                           \
654     x = width >> 3;                                                       \
655     __asm__ volatile(                                                     \
656         "xor          %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"     \
657         "li           %[rtmp0],      0x06                       \n\t"     \
658         "dmtc1        %[rtmp0],      %[ftmp1]                   \n\t"     \
659         "li           %[rtmp0],      0x10                       \n\t"     \
660         "dmtc1        %[rtmp0],      %[ftmp10]                  \n\t"     \
661         "li           %[rtmp0],      0x40                       \n\t"     \
662         "dmtc1        %[rtmp0],      %[offset]                  \n\t"     \
663         "punpcklhw    %[offset],     %[offset],     %[offset]   \n\t"     \
664         "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"     \
665                                                                           \
666         "1:                                                     \n\t"     \
667         "2:                                                     \n\t"     \
668         "gsldlc1      %[ftmp5],      0x07(%[src])               \n\t"     \
669         "gsldrc1      %[ftmp5],      0x00(%[src])               \n\t"     \
670         "gsldlc1      %[ftmp2],      0x07(%[src2])              \n\t"     \
671         "gsldrc1      %[ftmp2],      0x00(%[src2])              \n\t"     \
672         "gsldlc1      %[ftmp3],      0x0f(%[src2])              \n\t"     \
673         "gsldrc1      %[ftmp3],      0x08(%[src2])              \n\t"     \
674         "punpcklbh    %[ftmp4],      %[ftmp5],      %[ftmp0]    \n\t"     \
675         "punpckhbh    %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"     \
676         "psllh        %[ftmp4],      %[ftmp4],      %[ftmp1]    \n\t"     \
677         "psllh        %[ftmp5],      %[ftmp5],      %[ftmp1]    \n\t"     \
678         "paddh        %[ftmp4],      %[ftmp4],      %[offset]   \n\t"     \
679         "paddh        %[ftmp5],      %[ftmp5],      %[offset]   \n\t"     \
680         "punpcklhw    %[ftmp6],      %[ftmp4],      %[ftmp0]    \n\t"     \
681         "punpckhhw    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"     \
682         "punpcklhw    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"     \
683         "punpckhhw    %[ftmp9],      %[ftmp5],      %[ftmp0]    \n\t"     \
684         "punpcklhw    %[ftmp4],      %[ftmp0],      %[ftmp3]    \n\t"     \
685         "punpckhhw    %[ftmp5],      %[ftmp0],      %[ftmp3]    \n\t"     \
686         "punpckhhw    %[ftmp3],      %[ftmp0],      %[ftmp2]    \n\t"     \
687         "punpcklhw    %[ftmp2],      %[ftmp0],      %[ftmp2]    \n\t"     \
688         "psraw        %[ftmp2],      %[ftmp2],      %[ftmp10]   \n\t"     \
689         "psraw        %[ftmp3],      %[ftmp3],      %[ftmp10]   \n\t"     \
690         "psraw        %[ftmp4],      %[ftmp4],      %[ftmp10]   \n\t"     \
691         "psraw        %[ftmp5],      %[ftmp5],      %[ftmp10]   \n\t"     \
692         "paddw        %[ftmp2],      %[ftmp2],      %[ftmp6]    \n\t"     \
693         "paddw        %[ftmp3],      %[ftmp3],      %[ftmp7]    \n\t"     \
694         "paddw        %[ftmp4],      %[ftmp4],      %[ftmp8]    \n\t"     \
695         "paddw        %[ftmp5],      %[ftmp5],      %[ftmp9]    \n\t"     \
696         "psraw        %[ftmp2],      %[ftmp2],      %[shift]    \n\t"     \
697         "psraw        %[ftmp3],      %[ftmp3],      %[shift]    \n\t"     \
698         "psraw        %[ftmp4],      %[ftmp4],      %[shift]    \n\t"     \
699         "psraw        %[ftmp5],      %[ftmp5],      %[shift]    \n\t"     \
700         "packsswh     %[ftmp2],      %[ftmp2],      %[ftmp3]    \n\t"     \
701         "packsswh     %[ftmp4],      %[ftmp4],      %[ftmp5]    \n\t"     \
702         "pcmpgth      %[ftmp3],      %[ftmp2],      %[ftmp0]    \n\t"     \
703         "pcmpgth      %[ftmp5],      %[ftmp4],      %[ftmp0]    \n\t"     \
704         "and          %[ftmp2],      %[ftmp2],      %[ftmp3]    \n\t"     \
705         "and          %[ftmp4],      %[ftmp4],      %[ftmp5]    \n\t"     \
706         "packushb     %[ftmp2],      %[ftmp2],      %[ftmp4]    \n\t"     \
707         "gssdlc1      %[ftmp2],      0x07(%[dst])               \n\t"     \
708         "gssdrc1      %[ftmp2],      0x00(%[dst])               \n\t"     \
709                                                                           \
710         "daddi        %[x],          %[x],         -0x01        \n\t"     \
711         PTR_ADDIU    "%[src],        %[src],        0x08        \n\t"     \
712         PTR_ADDIU    "%[dst],        %[dst],        0x08        \n\t"     \
713         PTR_ADDIU    "%[src2],       %[src2],       0x10        \n\t"     \
714         "bnez         %[x],          2b                         \n\t"     \
715                                                                           \
716         PTR_ADDIU    "%[src],        %[src],     " #src_step "  \n\t"     \
717         PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"     \
718         PTR_ADDIU    "%[src2],       %[src2],    " #src2_step " \n\t"     \
719         "li           %[x],        " #x_step "                  \n\t"     \
720         "daddi        %[y],          %[y],         -0x01        \n\t"     \
721         PTR_ADDU     "%[src],        %[src],       %[srcstride] \n\t"     \
722         PTR_ADDU     "%[dst],        %[dst],       %[dststride] \n\t"     \
723         PTR_ADDIU    "%[src2],       %[src2],       0x80        \n\t"     \
724         "bnez         %[y],          1b                         \n\t"     \
725         : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                   \
726           [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                   \
727           [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                   \
728           [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                   \
729           [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                   \
730           [ftmp10]"=&f"(ftmp[10]), [offset]"=&f"(ftmp[11]),               \
731           [src2]"+&r"(src2), [dst]"+&r"(dst), [src]"+&r"(src),            \
732           [x]"+&r"(x), [y]"+&r"(y), [rtmp0]"=&r"(rtmp[0])                 \
733         : [dststride]"r"(dststride), [shift]"f"(shift),                   \
734           [srcstride]"r"(srcstride)                                       \
735         : "memory"                                                        \
736     );                                                                    \
737 }                                                                         \
738
/*
 * Instantiate ff_hevc_put_hevc_pel_bi_pixels<w>_8_mmi for w = 8, 16, 24,
 * 32, 48 and 64. Arguments are (w, x_step, src_step, dst_step, src2_step):
 * x_step = w / 8 is the number of 8-pixel inner-loop iterations per row,
 * while src_step = dst_step = -w and src2_step = -2 * w cancel the
 * per-row pointer advance of the inner loop before the row pitch is added.
 */
739 PUT_HEVC_PEL_BI_PIXELS(8, 1, -8, -8, -16);
740 PUT_HEVC_PEL_BI_PIXELS(16, 2, -16, -16, -32);
741 PUT_HEVC_PEL_BI_PIXELS(24, 3, -24, -24, -48);
742 PUT_HEVC_PEL_BI_PIXELS(32, 4, -32, -32, -64);
743 PUT_HEVC_PEL_BI_PIXELS(48, 6, -48, -48, -96);
744 PUT_HEVC_PEL_BI_PIXELS(64, 8, -64, -64, -128);
745
746 #define PUT_HEVC_QPEL_UNI_HV(w, x_step, src_step, dst_step, tmp_step)   \
747 void ff_hevc_put_hevc_qpel_uni_hv##w##_8_mmi(uint8_t *_dst,             \
748                                              ptrdiff_t _dststride,      \
749                                              uint8_t *_src,             \
750                                              ptrdiff_t _srcstride,      \
751                                              int height,                \
752                                              intptr_t mx, intptr_t my,  \
753                                              int width)                 \
754 {                                                                       \
755     int x, y;                                                           \
756     const int8_t *filter;                                               \
757     pixel *src = (pixel*)_src;                                          \
758     ptrdiff_t srcstride = _srcstride / sizeof(pixel);                   \
759     pixel *dst          = (pixel *)_dst;                                \
760     ptrdiff_t dststride = _dststride / sizeof(pixel);                   \
761     int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];        \
762     int16_t *tmp = tmp_array;                                           \
763     uint64_t ftmp[20];                                                  \
764     uint64_t rtmp[1];                                                   \
765     int shift = 6;                                                      \
766     int offset = 32;                                                    \
767                                                                         \
768     src   -= (QPEL_EXTRA_BEFORE * srcstride + 3);                       \
769     filter = ff_hevc_qpel_filters[mx - 1];                              \
770     x = width >> 2;                                                     \
771     y = height + QPEL_EXTRA;                                            \
772     __asm__ volatile(                                                   \
773         MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
774         "li           %[rtmp0],      0x08                       \n\t"   \
775         "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
776         "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
777         "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
778         "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
779         "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
780         "xor          %[ftmp0],      %[ftmp0],      %[ftmp0]    \n\t"   \
781                                                                         \
782         "1:                                                     \n\t"   \
783         "2:                                                     \n\t"   \
784         "gsldlc1      %[ftmp3],      0x07(%[src])               \n\t"   \
785         "gsldrc1      %[ftmp3],      0x00(%[src])               \n\t"   \
786         "gsldlc1      %[ftmp4],      0x08(%[src])               \n\t"   \
787         "gsldrc1      %[ftmp4],      0x01(%[src])               \n\t"   \
788         "gsldlc1      %[ftmp5],      0x09(%[src])               \n\t"   \
789         "gsldrc1      %[ftmp5],      0x02(%[src])               \n\t"   \
790         "gsldlc1      %[ftmp6],      0x0a(%[src])               \n\t"   \
791         "gsldrc1      %[ftmp6],      0x03(%[src])               \n\t"   \
792         "punpcklbh    %[ftmp7],      %[ftmp3],      %[ftmp0]    \n\t"   \
793         "punpckhbh    %[ftmp8],      %[ftmp3],      %[ftmp0]    \n\t"   \
794         "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
795         "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
796         "paddh        %[ftmp3],      %[ftmp7],      %[ftmp8]    \n\t"   \
797         "punpcklbh    %[ftmp7],      %[ftmp4],      %[ftmp0]    \n\t"   \
798         "punpckhbh    %[ftmp8],      %[ftmp4],      %[ftmp0]    \n\t"   \
799         "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
800         "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
801         "paddh        %[ftmp4],      %[ftmp7],      %[ftmp8]    \n\t"   \
802         "punpcklbh    %[ftmp7],      %[ftmp5],      %[ftmp0]    \n\t"   \
803         "punpckhbh    %[ftmp8],      %[ftmp5],      %[ftmp0]    \n\t"   \
804         "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
805         "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
806         "paddh        %[ftmp5],      %[ftmp7],      %[ftmp8]    \n\t"   \
807         "punpcklbh    %[ftmp7],      %[ftmp6],      %[ftmp0]    \n\t"   \
808         "punpckhbh    %[ftmp8],      %[ftmp6],      %[ftmp0]    \n\t"   \
809         "pmullh       %[ftmp7],      %[ftmp7],      %[ftmp1]    \n\t"   \
810         "pmullh       %[ftmp8],      %[ftmp8],      %[ftmp2]    \n\t"   \
811         "paddh        %[ftmp6],      %[ftmp7],      %[ftmp8]    \n\t"   \
812         TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
813                      %[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10])           \
814         "paddh        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
815         "paddh        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
816         "paddh        %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
817         "gssdlc1      %[ftmp3],      0x07(%[tmp])               \n\t"   \
818         "gssdrc1      %[ftmp3],      0x00(%[tmp])               \n\t"   \
819                                                                         \
820         "daddi        %[x],          %[x],         -0x01        \n\t"   \
821         PTR_ADDIU    "%[src],        %[src],        0x04        \n\t"   \
822         PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
823         "bnez         %[x],          2b                         \n\t"   \
824                                                                         \
825         "daddi        %[y],          %[y],         -0x01        \n\t"   \
826         "li           %[x],        " #x_step "                  \n\t"   \
827         PTR_ADDIU    "%[src],        %[src],      " #src_step " \n\t"   \
828         PTR_ADDIU    "%[tmp],        %[tmp],      " #tmp_step " \n\t"   \
829         PTR_ADDU     "%[src],        %[src],        %[stride]   \n\t"   \
830         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
831         "bnez         %[y],          1b                         \n\t"   \
832         : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
833           [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
834           [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
835           [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
836           [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
837           [ftmp10]"=&f"(ftmp[10]), [rtmp0]"=&r"(rtmp[0]),               \
838           [src]"+&r"(src), [tmp]"+&r"(tmp), [y]"+&r"(y),                \
839           [x]"+&r"(x)                                                   \
840         : [filter]"r"(filter), [stride]"r"(srcstride)                   \
841         : "memory"                                                      \
842     );                                                                  \
843                                                                         \
844     tmp    = tmp_array;                                                 \
845     filter = ff_hevc_qpel_filters[my - 1];                              \
846     x = width >> 2;                                                     \
847     y = height;                                                         \
848     __asm__ volatile(                                                   \
849         MMI_LDC1(%[ftmp1], %[filter], 0x00)                             \
850         "li           %[rtmp0],      0x08                       \n\t"   \
851         "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
852         "punpckhbh    %[ftmp2],      %[ftmp0],      %[ftmp1]    \n\t"   \
853         "punpcklbh    %[ftmp1],      %[ftmp0],      %[ftmp1]    \n\t"   \
854         "psrah        %[ftmp1],      %[ftmp1],      %[ftmp0]    \n\t"   \
855         "psrah        %[ftmp2],      %[ftmp2],      %[ftmp0]    \n\t"   \
856         "li           %[rtmp0],      0x06                       \n\t"   \
857         "dmtc1        %[rtmp0],      %[ftmp0]                   \n\t"   \
858         "punpcklhw    %[offset],     %[offset],     %[offset]   \n\t"   \
859         "punpcklwd    %[offset],     %[offset],     %[offset]   \n\t"   \
860                                                                         \
861         "1:                                                     \n\t"   \
862         "li           %[x],        " #x_step "                  \n\t"   \
863         "2:                                                     \n\t"   \
864         "gsldlc1      %[ftmp3],      0x07(%[tmp])               \n\t"   \
865         "gsldrc1      %[ftmp3],      0x00(%[tmp])               \n\t"   \
866         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
867         "gsldlc1      %[ftmp4],      0x07(%[tmp])               \n\t"   \
868         "gsldrc1      %[ftmp4],      0x00(%[tmp])               \n\t"   \
869         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
870         "gsldlc1      %[ftmp5],      0x07(%[tmp])               \n\t"   \
871         "gsldrc1      %[ftmp5],      0x00(%[tmp])               \n\t"   \
872         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
873         "gsldlc1      %[ftmp6],      0x07(%[tmp])               \n\t"   \
874         "gsldrc1      %[ftmp6],      0x00(%[tmp])               \n\t"   \
875         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
876         "gsldlc1      %[ftmp7],      0x07(%[tmp])               \n\t"   \
877         "gsldrc1      %[ftmp7],      0x00(%[tmp])               \n\t"   \
878         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
879         "gsldlc1      %[ftmp8],      0x07(%[tmp])               \n\t"   \
880         "gsldrc1      %[ftmp8],      0x00(%[tmp])               \n\t"   \
881         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
882         "gsldlc1      %[ftmp9],      0x07(%[tmp])               \n\t"   \
883         "gsldrc1      %[ftmp9],      0x00(%[tmp])               \n\t"   \
884         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
885         "gsldlc1      %[ftmp10],     0x07(%[tmp])               \n\t"   \
886         "gsldrc1      %[ftmp10],     0x00(%[tmp])               \n\t"   \
887         PTR_ADDIU    "%[tmp],        %[tmp],        -0x380      \n\t"   \
888         TRANSPOSE_4H(%[ftmp3], %[ftmp4], %[ftmp5], %[ftmp6],            \
889                      %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])        \
890         TRANSPOSE_4H(%[ftmp7], %[ftmp8], %[ftmp9], %[ftmp10],           \
891                      %[ftmp11], %[ftmp12], %[ftmp13], %[ftmp14])        \
892         "pmaddhw      %[ftmp11],     %[ftmp3],      %[ftmp1]    \n\t"   \
893         "pmaddhw      %[ftmp12],     %[ftmp7],      %[ftmp2]    \n\t"   \
894         "pmaddhw      %[ftmp13],     %[ftmp4],      %[ftmp1]    \n\t"   \
895         "pmaddhw      %[ftmp14],     %[ftmp8],      %[ftmp2]    \n\t"   \
896         "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"   \
897         "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"   \
898         TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp3], %[ftmp4])          \
899         "paddw        %[ftmp3],      %[ftmp3],      %[ftmp4]    \n\t"   \
900         "psraw        %[ftmp3],      %[ftmp3],      %[ftmp0]    \n\t"   \
901         "pmaddhw      %[ftmp11],     %[ftmp5],      %[ftmp1]    \n\t"   \
902         "pmaddhw      %[ftmp12],     %[ftmp9],      %[ftmp2]    \n\t"   \
903         "pmaddhw      %[ftmp13],     %[ftmp6],      %[ftmp1]    \n\t"   \
904         "pmaddhw      %[ftmp14],     %[ftmp10],     %[ftmp2]    \n\t"   \
905         "paddw        %[ftmp11],     %[ftmp11],     %[ftmp12]   \n\t"   \
906         "paddw        %[ftmp13],     %[ftmp13],     %[ftmp14]   \n\t"   \
907         TRANSPOSE_2W(%[ftmp11], %[ftmp13], %[ftmp5], %[ftmp6])          \
908         "paddw        %[ftmp5],      %[ftmp5],      %[ftmp6]    \n\t"   \
909         "psraw        %[ftmp5],      %[ftmp5],      %[ftmp0]    \n\t"   \
910         "packsswh     %[ftmp3],      %[ftmp3],      %[ftmp5]    \n\t"   \
911         "paddh        %[ftmp3],      %[ftmp3],      %[offset]   \n\t"   \
912         "psrah        %[ftmp3],      %[ftmp3],      %[shift]    \n\t"   \
913         "xor          %[ftmp7],      %[ftmp7],      %[ftmp7]    \n\t"   \
914         "pcmpgth      %[ftmp7],      %[ftmp3],      %[ftmp7]    \n\t"   \
915         "and          %[ftmp3],      %[ftmp3],      %[ftmp7]    \n\t"   \
916         "packushb     %[ftmp3],      %[ftmp3],      %[ftmp3]    \n\t"   \
917         "gsswlc1      %[ftmp3],      0x03(%[dst])               \n\t"   \
918         "gsswrc1      %[ftmp3],      0x00(%[dst])               \n\t"   \
919                                                                         \
920         "daddi        %[x],          %[x],         -0x01        \n\t"   \
921         PTR_ADDIU    "%[tmp],        %[tmp],        0x08        \n\t"   \
922         PTR_ADDIU    "%[dst],        %[dst],        0x04        \n\t"   \
923         "bnez         %[x],          2b                         \n\t"   \
924                                                                         \
925         "daddi        %[y],          %[y],         -0x01        \n\t"   \
926         PTR_ADDIU    "%[tmp],        %[tmp],     " #tmp_step "  \n\t"   \
927         PTR_ADDIU    "%[dst],        %[dst],     " #dst_step "  \n\t"   \
928         PTR_ADDU     "%[dst],        %[dst],        %[stride]   \n\t"   \
929         PTR_ADDIU    "%[tmp],        %[tmp],        0x80        \n\t"   \
930         "bnez         %[y],          1b                         \n\t"   \
931         : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),                 \
932           [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),                 \
933           [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),                 \
934           [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),                 \
935           [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),                 \
936           [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),             \
937           [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),             \
938           [ftmp14]"=&f"(ftmp[14]),                                      \
939           [dst]"+&r"(dst), [tmp]"+&r"(tmp), [y]"+&r"(y), [x]"=&r"(x),   \
940           [offset]"+&f"(offset), [rtmp0]"=&r"(rtmp[0])                  \
941         : [filter]"r"(filter), [stride]"r"(dststride),                  \
942           [shift]"f"(shift)                                             \
943         : "memory"                                                      \
944     );                                                                  \
945 }
946
/*
 * Instantiate PUT_HEVC_QPEL_UNI_HV once for each first-argument value
 * 4, 8, 12, 16, 24, 32, 48 and 64.
 *
 * NOTE(review): the macro's parameter list is outside this excerpt; the
 * argument order is presumably (w, x_step, src_step, dst_step, tmp_step)
 * with w being the block width -- confirm against the #define.
 *
 * Under that reading every row below follows the same pattern:
 *   x_step   = w / 4   inner-loop count; each pass of the second loop
 *                      stores 4 bytes to dst, then advances dst by 4 bytes
 *                      and tmp by 8 bytes.
 *   src_step = -w      added to src at the end of each row of the first
 *                      pass, just before the stride is added.
 *   dst_step = -w      added to dst at the end of each row, undoing the
 *                      4 * x_step bytes the inner loop advanced, before the
 *                      stride is added.
 *   tmp_step = -2 * w  added to tmp at the end of each row, undoing the
 *                      8 * x_step bytes the inner loop advanced, before the
 *                      fixed 0x80-byte row pitch is added.
 */
947 PUT_HEVC_QPEL_UNI_HV(4, 1, -4, -4, -8);
948 PUT_HEVC_QPEL_UNI_HV(8, 2, -8, -8, -16);
949 PUT_HEVC_QPEL_UNI_HV(12, 3, -12, -12, -24);
950 PUT_HEVC_QPEL_UNI_HV(16, 4, -16, -16, -32);
951 PUT_HEVC_QPEL_UNI_HV(24, 6, -24, -24, -48);
952 PUT_HEVC_QPEL_UNI_HV(32, 8, -32, -32, -64);
953 PUT_HEVC_QPEL_UNI_HV(48, 12, -48, -48, -96);
954 PUT_HEVC_QPEL_UNI_HV(64, 16, -64, -64, -128);