]> git.sesse.net Git - ffmpeg/blob - libavcodec/mips/hpeldsp_mmi.c
Merge commit '0bad254300356005af4aef00a706bf2e8eee96bc'
[ffmpeg] / libavcodec / mips / hpeldsp_mmi.c
1 /*
2  * Loongson SIMD optimized hpeldsp
3  *
4  * Copyright (c) 2016 Loongson Technology Corporation Limited
5  * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23
24 #include "hpeldsp_mips.h"
25 #include "libavcodec/bit_depth_template.c"
26 #include "libavutil/mips/asmdefs.h"
27 #include "constants.h"
28
29 void ff_put_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
30     ptrdiff_t line_size, int h)
31 {
32     double ftmp[2];
33     mips_reg addr[2];
34     uint64_t low32;
35
36     __asm__ volatile (
37         PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
38         "1:                                                             \n\t"
39         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
40         "uld        %[low32],   0x00(%[pixels])                         \n\t"
41         "mtc1       %[low32],   %[ftmp0]                                \n\t"
42         "uld        %[low32],   0x00(%[addr0])                          \n\t"
43         "mtc1       %[low32],   %[ftmp1]                                \n\t"
44         "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
45         "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
46         PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
47         PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
48
49         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
50         "uld        %[low32],   0x00(%[pixels])                         \n\t"
51         "mtc1       %[low32],   %[ftmp0]                                \n\t"
52         "uld        %[low32],   0x00(%[addr0])                          \n\t"
53         "mtc1       %[low32],   %[ftmp1]                                \n\t"
54         "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
55         "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
56         PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
57         PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
58
59         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
60         "bnez       %[h],       1b                                      \n\t"
61         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
62           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
63           [low32]"=&r"(low32),
64           [block]"+&r"(block),              [pixels]"+&r"(pixels),
65           [h]"+&r"(h)
66         : [line_size]"r"((mips_reg)line_size)
67         : "memory"
68     );
69 }
70
/**
 * Copy an 8-byte-wide block of h rows from pixels to block.
 *
 * Each loop iteration handles four rows (two unrolled pairs) and then
 * subtracts 4 from h; the loop exits only when h is exactly zero, so h is
 * expected to be a positive multiple of 4.
 *
 * @param block     destination, advanced by line_size per row; written with
 *                  sdc1/gssdxc1 (NOTE(review): presumably must be 8-byte
 *                  aligned -- confirm against callers)
 * @param pixels    source, advanced by line_size per row; read with
 *                  gsldlc1/gsldrc1 pairs
 * @param line_size byte stride shared by source and destination
 * @param h         number of rows to copy
 */
71 void ff_put_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
72     ptrdiff_t line_size, int h)
73 {
74     double ftmp[2];
75     mips_reg addr[2];
76
77     __asm__ volatile (
           /* addr1 = 2 * line_size: pointer advance after each pair of rows */
78         PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
79         "1:                                                             \n\t"
           /* rows 0 and 1: ftmp0 <- pixels[0..7], ftmp1 <- (pixels + line_size)[0..7] */
80         "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
81         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
82         "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
83         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
84         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
85         "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
86         "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
87         PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
88         PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
89
           /* rows 2 and 3: same sequence as above */
90         "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
91         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
92         "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
93         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
94         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
95         "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
96         "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
97         PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
98         PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
99
           /* four rows done; loop until h reaches exactly zero */
100         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
101         "bnez       %[h],       1b                                      \n\t"
102         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
103           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
104           [block]"+&r"(block),              [pixels]"+&r"(pixels),
105           [h]"+&r"(h)
106         : [line_size]"r"((mips_reg)line_size)
107         : "memory"
108     );
109 }
110
/**
 * Copy a 16-byte-wide block of h rows from pixels to block.
 *
 * Every row is moved as two 8-byte halves (offsets 0x00 and 0x08). Each loop
 * iteration handles four rows (two unrolled pairs) and then subtracts 4 from
 * h; the loop exits only when h is exactly zero, so h is expected to be a
 * positive multiple of 4.
 *
 * @param block     destination, advanced by line_size per row; written with
 *                  sdc1/gssdxc1 (NOTE(review): presumably must be 8-byte
 *                  aligned -- confirm against callers)
 * @param pixels    source, advanced by line_size per row; read with
 *                  gsldlc1/gsldrc1 pairs
 * @param line_size byte stride shared by source and destination
 * @param h         number of rows to copy
 */
111 void ff_put_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
112     ptrdiff_t line_size, int h)
113 {
114     double ftmp[4];
115     mips_reg addr[2];
116
117     __asm__ volatile (
            /* addr1 = 2 * line_size: pointer advance after each pair of rows */
118         PTR_ADDU   "%[addr1],   %[line_size],   %[line_size]            \n\t"
119         "1:                                                             \n\t"
            /* rows 0 and 1: ftmp0/ftmp2 hold the two halves of the first row,
             * ftmp1/ftmp3 the two halves of the row at pixels + line_size */
120         "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
121         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
122         "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
123         "gsldlc1    %[ftmp2],   0x0f(%[pixels])                         \n\t"
124         "gsldrc1    %[ftmp2],   0x08(%[pixels])                         \n\t"
125         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
126         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
127         "gsldlc1    %[ftmp3],   0x0f(%[addr0])                          \n\t"
128         "gsldrc1    %[ftmp3],   0x08(%[addr0])                          \n\t"
129         "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
130         "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
131         "sdc1       %[ftmp2],   0x08(%[block])                          \n\t"
132         "gssdxc1    %[ftmp3],   0x08(%[block],  %[line_size])           \n\t"
133         PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
134         PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
135
            /* rows 2 and 3: same sequence as above */
136         "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
137         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
138         "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
139         "gsldlc1    %[ftmp2],   0x0f(%[pixels])                         \n\t"
140         "gsldrc1    %[ftmp2],   0x08(%[pixels])                         \n\t"
141         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
142         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
143         "gsldlc1    %[ftmp3],   0x0f(%[addr0])                          \n\t"
144         "gsldrc1    %[ftmp3],   0x08(%[addr0])                          \n\t"
145         "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
146         "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
147         "sdc1       %[ftmp2],   0x08(%[block])                          \n\t"
148         "gssdxc1    %[ftmp3],   0x08(%[block],  %[line_size])           \n\t"
149         PTR_ADDU   "%[pixels],  %[pixels],      %[addr1]                \n\t"
150         PTR_ADDU   "%[block],   %[block],       %[addr1]                \n\t"
151
            /* four rows done; loop until h reaches exactly zero */
152         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
153         "bnez       %[h],       1b                                      \n\t"
154         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
155           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
156           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
157           [block]"+&r"(block),              [pixels]"+&r"(pixels),
158           [h]"+&r"(h)
159         : [line_size]"r"((mips_reg)line_size)
160         : "memory"
161     );
162 }
163
164 void ff_avg_pixels4_8_mmi(uint8_t *block, const uint8_t *pixels,
165     ptrdiff_t line_size, int h)
166 {
167     double ftmp[4];
168     mips_reg addr[3];
169     uint64_t low32;
170
171     __asm__ volatile (
172         PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
173         "1:                                                             \n\t"
174         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
175         "uld        %[low32],   0x00(%[pixels])                         \n\t"
176         "mtc1       %[low32],   %[ftmp0]                                \n\t"
177         "uld        %[low32],   0x00(%[addr0])                          \n\t"
178         "mtc1       %[low32],   %[ftmp1]                                \n\t"
179         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
180         "uld        %[low32],   0x00(%[block])                          \n\t"
181         "mtc1       %[low32],   %[ftmp2]                                \n\t"
182         "uld        %[low32],   0x00(%[addr1])                          \n\t"
183         "mtc1       %[low32],   %[ftmp3]                                \n\t"
184         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
185         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
186         "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
187         "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
188         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
189         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
190
191         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
192         "uld        %[low32],   0x00(%[pixels])                         \n\t"
193         "mtc1       %[low32],   %[ftmp0]                                \n\t"
194         "uld        %[low32],   0x00(%[addr0])                          \n\t"
195         "mtc1       %[low32],   %[ftmp1]                                \n\t"
196         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
197         "uld        %[low32],   0x00(%[block])                          \n\t"
198         "mtc1       %[low32],   %[ftmp2]                                \n\t"
199         "uld        %[low32],   0x00(%[addr1])                          \n\t"
200         "mtc1       %[low32],   %[ftmp3]                                \n\t"
201         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
202         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
203         "swc1       %[ftmp0],   0x00(%[block])                          \n\t"
204         "gsswxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
205         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
206         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
207
208         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
209         "bnez       %[h],       1b                                      \n\t"
210         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
211           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
212           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
213           [addr2]"=&r"(addr[2]),
214           [low32]"=&r"(low32),
215           [block]"+&r"(block),              [pixels]"+&r"(pixels),
216           [h]"+&r"(h)
217         : [line_size]"r"((mips_reg)line_size)
218         : "memory"
219     );
220 }
221
/**
 * Average an 8-byte-wide block of h rows from pixels into block.
 *
 * For every row the eight source bytes and the eight bytes already in block
 * are combined with pavgb (packed per-byte average) and the result is written
 * back to block. NOTE(review): pavgb presumably rounds like the x86
 * instruction of the same name -- confirm against the Loongson MMI manual.
 *
 * Each loop iteration handles four rows (two unrolled pairs) and then
 * subtracts 4 from h; the loop exits only when h is exactly zero, so h is
 * expected to be a positive multiple of 4.
 *
 * @param block     destination and second averaging operand, advanced by
 *                  line_size per row; written with sdc1/gssdxc1
 *                  (NOTE(review): presumably must be 8-byte aligned)
 * @param pixels    source, advanced by line_size per row; read with
 *                  gsldlc1/gsldrc1 pairs
 * @param line_size byte stride shared by source and destination
 * @param h         number of rows to process
 */
222 void ff_avg_pixels8_8_mmi(uint8_t *block, const uint8_t *pixels,
223     ptrdiff_t line_size, int h)
224 {
225     double ftmp[4];
226     mips_reg addr[3];
227
228     __asm__ volatile (
            /* addr2 = 2 * line_size: pointer advance after each pair of rows */
229         PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
230         "1:                                                             \n\t"
            /* rows 0 and 1: ftmp0/ftmp1 <- source rows, ftmp2/ftmp3 <- the
             * matching destination rows (addr0/addr1 point at the second row) */
231         "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
232         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
233         "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
234         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
235         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
236         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
237         "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
238         "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
239         "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
240         "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
241         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
242         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
243         "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
244         "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
245         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
246         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
247
            /* rows 2 and 3: same sequence as above */
248         "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
249         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
250         "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
251         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
252         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
253         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
254         "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
255         "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
256         "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
257         "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
258         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
259         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
260         "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
261         "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
262         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
263         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
264
            /* four rows done; loop until h reaches exactly zero */
265         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
266         "bnez       %[h],       1b                                      \n\t"
267         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
268           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
269           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
270           [addr2]"=&r"(addr[2]),
271           [block]"+&r"(block),              [pixels]"+&r"(pixels),
272           [h]"+&r"(h)
273         : [line_size]"r"((mips_reg)line_size)
274         : "memory"
275     );
276 }
277
/**
 * Average a 16-byte-wide block of h rows from pixels into block.
 *
 * Every row is handled as two 8-byte halves (offsets 0x00 and 0x08). The
 * source bytes and the bytes already in block are combined with pavgb
 * (packed per-byte average) and the result is written back to block.
 * NOTE(review): pavgb presumably rounds like the x86 instruction of the same
 * name -- confirm against the Loongson MMI manual.
 *
 * Each loop iteration handles four rows (two unrolled pairs) and then
 * subtracts 4 from h; the loop exits only when h is exactly zero, so h is
 * expected to be a positive multiple of 4.
 *
 * @param block     destination and second averaging operand, advanced by
 *                  line_size per row; written with sdc1/gssdxc1
 *                  (NOTE(review): presumably must be 8-byte aligned)
 * @param pixels    source, advanced by line_size per row; read with
 *                  gsldlc1/gsldrc1 pairs
 * @param line_size byte stride shared by source and destination
 * @param h         number of rows to process
 */
278 void ff_avg_pixels16_8_mmi(uint8_t *block, const uint8_t *pixels,
279     ptrdiff_t line_size, int h)
280 {
281     double ftmp[8];
282     mips_reg addr[3];
283
284     __asm__ volatile (
            /* addr2 = 2 * line_size: pointer advance after each pair of rows */
285         PTR_ADDU   "%[addr2],   %[line_size],   %[line_size]            \n\t"
286         "1:                                                             \n\t"
            /* rows 0 and 1:
             *   ftmp0/ftmp4 <- first source row, ftmp1/ftmp5 <- second source row
             *   ftmp2/ftmp6 <- first dest row,   ftmp3/ftmp7 <- second dest row */
287         "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
288         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
289         "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
290         "gsldlc1    %[ftmp4],   0x0f(%[pixels])                         \n\t"
291         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
292         "gsldrc1    %[ftmp4],   0x08(%[pixels])                         \n\t"
293         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
294         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
295         "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
296         "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
297         "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
298         "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
299         "gsldlc1    %[ftmp6],   0x0f(%[block])                          \n\t"
300         "gsldrc1    %[ftmp6],   0x08(%[block])                          \n\t"
301         "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
302         "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
303         "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
304         "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
305         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
306         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
307         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
308         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
309         "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
310         "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
311         "sdc1       %[ftmp4],   0x08(%[block])                          \n\t"
312         "gssdxc1    %[ftmp5],   0x08(%[block],  %[line_size])           \n\t"
313         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
314         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
315
            /* rows 2 and 3: same sequence as above */
316         "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
317         PTR_ADDU   "%[addr0],   %[pixels],      %[line_size]            \n\t"
318         "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
319         "gsldlc1    %[ftmp4],   0x0f(%[pixels])                         \n\t"
320         PTR_ADDU   "%[addr1],   %[block],       %[line_size]            \n\t"
321         "gsldrc1    %[ftmp4],   0x08(%[pixels])                         \n\t"
322         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
323         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
324         "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
325         "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
326         "gsldlc1    %[ftmp2],   0x07(%[block])                          \n\t"
327         "gsldrc1    %[ftmp2],   0x00(%[block])                          \n\t"
328         "gsldlc1    %[ftmp6],   0x0f(%[block])                          \n\t"
329         "gsldrc1    %[ftmp6],   0x08(%[block])                          \n\t"
330         "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
331         "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
332         "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
333         "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
334         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
335         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
336         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
337         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
338         "sdc1       %[ftmp0],   0x00(%[block])                          \n\t"
339         "gssdxc1    %[ftmp1],   0x00(%[block],  %[line_size])           \n\t"
340         "sdc1       %[ftmp4],   0x08(%[block])                          \n\t"
341         "gssdxc1    %[ftmp5],   0x08(%[block],  %[line_size])           \n\t"
342         PTR_ADDU   "%[pixels],  %[pixels],      %[addr2]                \n\t"
343         PTR_ADDU   "%[block],   %[block],       %[addr2]                \n\t"
344
            /* four rows done; loop until h reaches exactly zero */
345         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
346         "bnez       %[h],       1b                                      \n\t"
347         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
348           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
349           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
350           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
351           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
352           [addr2]"=&r"(addr[2]),
353           [block]"+&r"(block),              [pixels]"+&r"(pixels),
354           [h]"+&r"(h)
355         : [line_size]"r"((mips_reg)line_size)
356         : "memory"
357     );
358 }
359
360 inline void ff_put_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
361     const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
362     int h)
363 {
364     double ftmp[4];
365     mips_reg addr[5];
366     uint64_t low32;
367
368     __asm__ volatile (
369         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
370         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
371         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
372         "1:                                                             \n\t"
373         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
374         "uld        %[low32],   0x00(%[src1])                           \n\t"
375         "mtc1       %[low32],   %[ftmp0]                                \n\t"
376         "uld        %[low32],   0x00(%[addr0])                          \n\t"
377         "mtc1       %[low32],   %[ftmp1]                                \n\t"
378         "uld        %[low32],   0x00(%[src2])                           \n\t"
379         "mtc1       %[low32],   %[ftmp2]                                \n\t"
380         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
381         "uld        %[low32],   0x00(%[addr1])                          \n\t"
382         "mtc1       %[low32],   %[ftmp3]                                \n\t"
383         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
384         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
385         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
386         "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
387         "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
388         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
389         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
390
391         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
392         "uld        %[low32],   0x00(%[src1])                           \n\t"
393         "mtc1       %[low32],   %[ftmp0]                                \n\t"
394         "uld        %[low32],   0x00(%[addr0])                          \n\t"
395         "mtc1       %[low32],   %[ftmp1]                                \n\t"
396         "uld        %[low32],   0x00(%[src2])                           \n\t"
397         "mtc1       %[low32],   %[ftmp2]                                \n\t"
398         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
399         "uld        %[low32],   0x00(%[addr1])                          \n\t"
400         "mtc1       %[low32],   %[ftmp3]                                \n\t"
401         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
402         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
403         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
404         "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
405         "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
406         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
407         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
408
409         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
410         "bnez       %[h],       1b                                      \n\t"
411         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
412           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
413           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
414           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
415           [addr4]"=&r"(addr[4]),
416           [low32]"=&r"(low32),
417           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
418           [src2]"+&r"(src2),                [h]"+&r"(h)
419         : [dst_stride]"r"((mips_reg)dst_stride),
420           [src_stride1]"r"((mips_reg)src_stride1),
421           [src_stride2]"r"((mips_reg)src_stride2)
422         : "memory"
423     );
424 }
425
/**
 * Write the per-byte average of two 8-byte-wide sources to dst, h rows.
 *
 * For every row eight bytes of src1 and eight bytes of src2 are combined with
 * pavgb (packed per-byte average) and stored to dst. NOTE(review): pavgb
 * presumably rounds like the x86 instruction of the same name -- confirm
 * against the Loongson MMI manual.
 *
 * Each loop iteration handles four rows (two unrolled pairs) and then
 * subtracts 4 from h; the loop exits only when h is exactly zero, so h is
 * expected to be a positive multiple of 4.
 *
 * @param dst         destination, advanced by dst_stride per row; written
 *                    with sdc1/gssdxc1 (NOTE(review): presumably must be
 *                    8-byte aligned)
 * @param src1        first source, advanced by src_stride1 per row
 * @param src2        second source, advanced by src_stride2 per row
 * @param dst_stride  byte stride of dst
 * @param src_stride1 byte stride of src1
 * @param src_stride2 byte stride of src2
 * @param h           number of rows to process
 */
426 inline void ff_put_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
427     const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
428     int h)
429 {
430     double ftmp[4];
431     mips_reg addr[5];
432
433     __asm__ volatile (
            /* addr2/addr3/addr4 = twice the src1/src2/dst stride: the pointer
             * advance applied after each pair of rows */
434         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
435         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
436         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
437         "1:                                                             \n\t"
            /* rows 0 and 1: ftmp0/ftmp1 <- src1 rows, ftmp2/ftmp3 <- src2 rows;
             * src1 is advanced early, once its second-row address is in addr0 */
438         "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
439         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
440         "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
441         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
442         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
443         "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
444         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
445         "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
446         "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
447         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
448         "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
449         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
450         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
451         "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
452         "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
453         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
454         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
455
            /* rows 2 and 3: same sequence as above */
456         "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
457         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
458         "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
459         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
460         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
461         "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
462         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
463         "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
464         "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
465         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
466         "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
467         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
468         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
469         "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
470         "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
471         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
472         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
473
            /* four rows done; loop until h reaches exactly zero */
474         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
475         "bnez       %[h],       1b                                      \n\t"
476         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
477           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
478           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
479           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
480           [addr4]"=&r"(addr[4]),
481           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
482           [src2]"+&r"(src2),                [h]"+&r"(h)
483         : [dst_stride]"r"((mips_reg)dst_stride),
484           [src_stride1]"r"((mips_reg)src_stride1),
485           [src_stride2]"r"((mips_reg)src_stride2)
486         : "memory"
487     );
488 }
489
/*
 * Store the per-byte pavgb combination of two 16-byte-wide sources into dst.
 *
 * dst          destination rows; written with sdc1/gssdxc1 (presumably these
 *              need 8-byte alignment - TODO confirm against the ISA manual)
 * src1, src2   source rows, fetched with gsldlc1/gsldrc1 pairs
 * dst_stride   byte distance between consecutive dst rows
 * src_stride1  byte distance between consecutive src1 rows
 * src_stride2  byte distance between consecutive src2 rows
 * h            row count; each loop pass handles 4 rows and the loop only
 *              exits when h reaches exactly 0, so h must be a positive
 *              multiple of 4
 */
490 inline void ff_put_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
491     const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
492     int h)
493 {
494     double ftmp[8];
495     mips_reg addr[5];
496
497     __asm__ volatile (
            /* addr2/addr3/addr4 = 2 * stride: pointers advance two rows at a time */
498         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
499         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
500         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
501         "1:                                                             \n\t"
            /* rows 0-1 of src1: ftmp0/ftmp4 = bytes 0-7/8-15 of the row at src1,
             * ftmp1/ftmp5 = the same for the row at addr0 = src1 + src_stride1 */
502         "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
503         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
504         "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
505         "gsldlc1    %[ftmp4],   0x0f(%[src1])                           \n\t"
506         "gsldrc1    %[ftmp4],   0x08(%[src1])                           \n\t"
507         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
508         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
509         "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
510         "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
            /* rows 0-1 of src2 into ftmp2/ftmp6 and ftmp3/ftmp7 (addr1 = next row) */
511         "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
512         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
513         "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
514         "gsldlc1    %[ftmp6],   0x0f(%[src2])                           \n\t"
515         "gsldrc1    %[ftmp6],   0x08(%[src2])                           \n\t"
516         "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
517         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
518         "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
519         "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
520         "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
            /* combine src1 with src2, 8 bytes per instruction */
521         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
522         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
523         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
524         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
            /* write both rows: offsets 0 and 8 at dst and at dst + dst_stride */
525         "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
526         "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
527         "sdc1       %[ftmp4],   0x08(%[dst])                            \n\t"
528         "gssdxc1    %[ftmp5],   0x08(%[dst],    %[dst_stride])          \n\t"
529         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
530         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
531
            /* rows 2-3: identical instruction sequence on the advanced pointers */
532         "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
533         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
534         "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
535         "gsldlc1    %[ftmp4],   0x0f(%[src1])                           \n\t"
536         "gsldrc1    %[ftmp4],   0x08(%[src1])                           \n\t"
537         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
538         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
539         "gsldlc1    %[ftmp5],   0x0f(%[addr0])                          \n\t"
540         "gsldrc1    %[ftmp5],   0x08(%[addr0])                          \n\t"
541         "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
542         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
543         "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
544         "gsldlc1    %[ftmp6],   0x0f(%[src2])                           \n\t"
545         "gsldrc1    %[ftmp6],   0x08(%[src2])                           \n\t"
546         "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
547         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
548         "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
549         "gsldlc1    %[ftmp7],   0x0f(%[addr1])                          \n\t"
550         "gsldrc1    %[ftmp7],   0x08(%[addr1])                          \n\t"
551         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
552         "pavgb      %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
553         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
554         "pavgb      %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
555         "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
556         "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
557         "sdc1       %[ftmp4],   0x08(%[dst])                            \n\t"
558         "gssdxc1    %[ftmp5],   0x08(%[dst],    %[dst_stride])          \n\t"
559         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
560         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
561
            /* four rows consumed; loop until the counter is exactly zero */
562         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
563         "bnez       %[h],       1b                                      \n\t"
564         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
565           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
566           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
567           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
568           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
569           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
570           [addr4]"=&r"(addr[4]),
571           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
572           [src2]"+&r"(src2),                [h]"+&r"(h)
573         : [dst_stride]"r"((mips_reg)dst_stride),
574           [src_stride1]"r"((mips_reg)src_stride1),
575           [src_stride2]"r"((mips_reg)src_stride2)
576         : "memory"
577     );
578 }
579
580 inline void ff_avg_pixels4_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
581     const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
582     int h)
583 {
584     double ftmp[6];
585     mips_reg addr[6];
586     uint64_t low32;
587
588     __asm__ volatile (
589         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
590         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
591         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
592         "1:                                                             \n\t"
593         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
594         "uld        %[low32],   0x00(%[src1])                           \n\t"
595         "mtc1       %[low32],   %[ftmp0]                                \n\t"
596         "uld        %[low32],   0x00(%[addr0])                          \n\t"
597         "mtc1       %[low32],   %[ftmp1]                                \n\t"
598         "uld        %[low32],   0x00(%[src2])                           \n\t"
599         "mtc1       %[low32],   %[ftmp2]                                \n\t"
600         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
601         "uld        %[low32],   0x00(%[addr1])                          \n\t"
602         "mtc1       %[low32],   %[ftmp3]                                \n\t"
603         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
604         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
605         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
606         PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
607         "uld        %[low32],   0x00(%[dst])                            \n\t"
608         "mtc1       %[low32],   %[ftmp4]                                \n\t"
609         "uld        %[low32],   0x00(%[addr5])                          \n\t"
610         "mtc1       %[low32],   %[ftmp5]                                \n\t"
611         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
612         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
613         "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
614         "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
615         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
616         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
617
618         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
619         "uld        %[low32],   0x00(%[src1])                           \n\t"
620         "mtc1       %[low32],   %[ftmp0]                                \n\t"
621         "uld        %[low32],   0x00(%[addr0])                          \n\t"
622         "mtc1       %[low32],   %[ftmp1]                                \n\t"
623         "uld        %[low32],   0x00(%[src2])                           \n\t"
624         "mtc1       %[low32],   %[ftmp2]                                \n\t"
625         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
626         "uld        %[low32],   0x00(%[addr1])                          \n\t"
627         "mtc1       %[low32],   %[ftmp3]                                \n\t"
628         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
629         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
630         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
631         PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
632         "uld        %[low32],   0x00(%[dst])                            \n\t"
633         "mtc1       %[low32],   %[ftmp4]                                \n\t"
634         "uld        %[low32],   0x00(%[addr5])                          \n\t"
635         "mtc1       %[low32],   %[ftmp5]                                \n\t"
636         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
637         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
638         "swc1       %[ftmp0],   0x00(%[dst])                            \n\t"
639         "gsswxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
640         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
641         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
642
643         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
644         "bnez       %[h],       1b                                      \n\t"
645         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
646           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
647           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
648           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
649           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
650           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
651           [low32]"=&r"(low32),
652           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
653           [src2]"+&r"(src2),                [h]"+&r"(h)
654         : [dst_stride]"r"((mips_reg)dst_stride),
655           [src_stride1]"r"((mips_reg)src_stride1),
656           [src_stride2]"r"((mips_reg)src_stride2)
657         : "memory"
658     );
659 }
660
/*
 * Blend two 8-byte-wide sources into dst: every output row is
 * pavgb(pavgb(src1 row, src2 row), existing dst row).
 *
 * dst          destination rows, read and then overwritten; written with
 *              sdc1/gssdxc1 (presumably these need 8-byte alignment -
 *              TODO confirm against the ISA manual)
 * src1, src2   source rows, fetched with gsldlc1/gsldrc1 pairs
 * dst_stride   byte distance between consecutive dst rows
 * src_stride1  byte distance between consecutive src1 rows
 * src_stride2  byte distance between consecutive src2 rows
 * h            row count; each loop pass handles 4 rows and the loop only
 *              exits when h reaches exactly 0, so h must be a positive
 *              multiple of 4
 */
661 inline void ff_avg_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
662     const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
663     int h)
664 {
665     double ftmp[6];
666     mips_reg addr[6];
667
668     __asm__ volatile (
            /* addr2/addr3/addr4 = 2 * stride: pointers advance two rows at a time */
669         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
670         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
671         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
672         "1:                                                             \n\t"
            /* rows 0-1: ftmp0/ftmp1 <- src1 rows, ftmp2/ftmp3 <- src2 rows */
673         "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
674         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
675         "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
676         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
677         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
678         "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
679         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
680         "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
681         "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
682         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
683         "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
684         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
685         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
            /* fold in what dst already holds (addr5 = second dst row) */
686         PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
687         "gsldlc1    %[ftmp4],   0x07(%[dst])                            \n\t"
688         "gsldrc1    %[ftmp4],   0x00(%[dst])                            \n\t"
689         "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
690         "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
691         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
692         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
693         "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
694         "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
695         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
696         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
697
            /* rows 2-3: identical instruction sequence on the advanced pointers */
698         "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
699         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
700         "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
701         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
702         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
703         "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
704         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
705         "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
706         "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
707         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
708         "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
709         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
710         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
711         PTR_ADDU   "%[addr5],   %[dst],         %[dst_stride]           \n\t"
712         "gsldlc1    %[ftmp4],   0x07(%[dst])                            \n\t"
713         "gsldrc1    %[ftmp4],   0x00(%[dst])                            \n\t"
714         "gsldlc1    %[ftmp5],   0x07(%[addr5])                          \n\t"
715         "gsldrc1    %[ftmp5],   0x00(%[addr5])                          \n\t"
716         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
717         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
718         "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
719         "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
720         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
721         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
722
            /* four rows consumed; loop until the counter is exactly zero */
723         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
724         "bnez       %[h],       1b                                      \n\t"
725         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
726           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
727           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
728           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
729           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
730           [addr4]"=&r"(addr[4]),            [addr5]"=&r"(addr[5]),
731           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
732           [src2]"+&r"(src2),                [h]"+&r"(h)
733         : [dst_stride]"r"((mips_reg)dst_stride),
734           [src_stride1]"r"((mips_reg)src_stride1),
735           [src_stride2]"r"((mips_reg)src_stride2)
736         : "memory"
737     );
738 }
739
/*
 * 16-byte-wide variant assembled from the 8-byte-wide routine: one pass
 * over byte columns 0-7 followed by one pass over byte columns 8-15, each
 * covering all h rows with the strides given by the caller.
 */
inline void ff_avg_pixels16_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
    const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
    int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        ff_avg_pixels8_l2_8_mmi(dst + col, src1 + col, src2 + col,
                                dst_stride, src_stride1, src_stride2, h);
}
749
/*
 * 4-wide "x2" put: hands ff_put_pixels4_l2_8_mmi the row at pixels and the
 * same row shifted one byte to the right as its two sources; line_size is
 * used as the stride for the destination and for both sources.
 */
void ff_put_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_put_pixels4_l2_8_mmi(block, pixels, right,
                            line_size, line_size, line_size, h);
}
756
/*
 * 8-wide "x2" put: hands ff_put_pixels8_l2_8_mmi the row at pixels and the
 * same row shifted one byte to the right as its two sources; line_size is
 * used as the stride for the destination and for both sources.
 */
void ff_put_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_put_pixels8_l2_8_mmi(block, pixels, right,
                            line_size, line_size, line_size, h);
}
763
/*
 * 16-wide "x2" put: hands ff_put_pixels16_l2_8_mmi the row at pixels and
 * the same row shifted one byte to the right as its two sources; line_size
 * is used as the stride for the destination and for both sources.
 */
void ff_put_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_put_pixels16_l2_8_mmi(block, pixels, right,
                             line_size, line_size, line_size, h);
}
770
/*
 * 4-wide "x2" avg: hands ff_avg_pixels4_l2_8_mmi the row at pixels and the
 * same row shifted one byte to the right as its two sources; line_size is
 * used as the stride for the destination and for both sources.
 */
void ff_avg_pixels4_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_avg_pixels4_l2_8_mmi(block, pixels, right,
                            line_size, line_size, line_size, h);
}
777
/*
 * 8-wide "x2" avg: hands ff_avg_pixels8_l2_8_mmi the row at pixels and the
 * same row shifted one byte to the right as its two sources; line_size is
 * used as the stride for the destination and for both sources.
 */
void ff_avg_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_avg_pixels8_l2_8_mmi(block, pixels, right,
                            line_size, line_size, line_size, h);
}
784
/*
 * 16-wide "x2" avg, done as two 8-wide passes: ff_avg_pixels8_x2_8_mmi is
 * run on byte columns 0-7 and then on byte columns 8-15.
 */
void ff_avg_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        ff_avg_pixels8_x2_8_mmi(block + col, pixels + col, line_size, h);
}
791
/*
 * Combine two 8-byte-wide sources into dst using pavgb on bit-inverted
 * data: both inputs are XORed with an all-ones mask, pavgb is applied, and
 * the result is XORed with the mask again before it is stored.
 * NOTE(review): if pavgb rounds halves up, this sequence yields the
 * round-down average that the "no_rnd" name suggests - confirm against the
 * Loongson MMI manual.
 *
 * dst          destination rows; written with sdc1/gssdxc1 (presumably these
 *              need 8-byte alignment - TODO confirm against the ISA manual)
 * src1, src2   source rows, fetched with gsldlc1/gsldrc1 pairs
 * dst_stride   byte distance between consecutive dst rows
 * src_stride1  byte distance between consecutive src1 rows
 * src_stride2  byte distance between consecutive src2 rows
 * h            row count; each loop pass handles 4 rows and the loop only
 *              exits when h reaches exactly 0, so h must be a positive
 *              multiple of 4
 */
792 inline void ff_put_no_rnd_pixels8_l2_8_mmi(uint8_t *dst, const uint8_t *src1,
793     const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2,
794     int h)
795 {
796     double ftmp[5];
797     mips_reg addr[5];
798
799     __asm__ volatile (
            /* ftmp4 = all ones: a register compared for equality with itself */
800         "pcmpeqb    %[ftmp4],   %[ftmp4],       %[ftmp4]                \n\t"
            /* addr2/addr3/addr4 = 2 * stride: pointers advance two rows at a time */
801         PTR_ADDU   "%[addr2],   %[src_stride1], %[src_stride1]          \n\t"
802         PTR_ADDU   "%[addr3],   %[src_stride2], %[src_stride2]          \n\t"
803         PTR_ADDU   "%[addr4],   %[dst_stride],  %[dst_stride]           \n\t"
804         "1:                                                             \n\t"
            /* rows 0-1: ftmp0/ftmp1 <- src1 rows, ftmp2/ftmp3 <- src2 rows */
805         "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
806         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
807         "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
808         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
809         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
810         "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
811         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
812         "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
813         "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
814         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
815         "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
            /* invert all four inputs */
816         "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
817         "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
818         "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
819         "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
820         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
821         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
            /* invert the two results back before storing */
822         "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
823         "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
824         "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
825         "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
826         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
827         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
828
            /* rows 2-3: identical instruction sequence on the advanced pointers */
829         "gsldlc1    %[ftmp0],   0x07(%[src1])                           \n\t"
830         PTR_ADDU   "%[addr0],   %[src1],        %[src_stride1]          \n\t"
831         "gsldrc1    %[ftmp0],   0x00(%[src1])                           \n\t"
832         "gsldlc1    %[ftmp1],   0x07(%[addr0])                          \n\t"
833         "gsldrc1    %[ftmp1],   0x00(%[addr0])                          \n\t"
834         "gsldlc1    %[ftmp2],   0x07(%[src2])                           \n\t"
835         PTR_ADDU   "%[addr1],   %[src2],        %[src_stride2]          \n\t"
836         "gsldrc1    %[ftmp2],   0x00(%[src2])                           \n\t"
837         "gsldlc1    %[ftmp3],   0x07(%[addr1])                          \n\t"
838         PTR_ADDU   "%[src1],    %[src1],        %[addr2]                \n\t"
839         "gsldrc1    %[ftmp3],   0x00(%[addr1])                          \n\t"
840         "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
841         "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
842         "xor        %[ftmp2],   %[ftmp2],       %[ftmp4]                \n\t"
843         "xor        %[ftmp3],   %[ftmp3],       %[ftmp4]                \n\t"
844         "pavgb      %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
845         "pavgb      %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
846         "xor        %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
847         "xor        %[ftmp1],   %[ftmp1],       %[ftmp4]                \n\t"
848         "sdc1       %[ftmp0],   0x00(%[dst])                            \n\t"
849         "gssdxc1    %[ftmp1],   0x00(%[dst],    %[dst_stride])          \n\t"
850         PTR_ADDU   "%[src2],    %[src2],        %[addr3]                \n\t"
851         PTR_ADDU   "%[dst],     %[dst],         %[addr4]                \n\t"
852
            /* four rows consumed; loop until the counter is exactly zero */
853         PTR_ADDI   "%[h],       %[h],           -0x04                   \n\t"
854         "bnez       %[h],       1b                                      \n\t"
855         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
856           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
857           [ftmp4]"=&f"(ftmp[4]),
858           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
859           [addr2]"=&r"(addr[2]),            [addr3]"=&r"(addr[3]),
860           [addr4]"=&r"(addr[4]),
861           [dst]"+&r"(dst),                  [src1]"+&r"(src1),
862           [src2]"+&r"(src2),                [h]"+&r"(h)
863         : [dst_stride]"r"((mips_reg)dst_stride),
864           [src_stride1]"r"((mips_reg)src_stride1),
865           [src_stride2]"r"((mips_reg)src_stride2)
866         : "memory"
867     );
868 }
869
/*
 * 8-wide "x2" no_rnd put: hands ff_put_no_rnd_pixels8_l2_8_mmi the row at
 * pixels and the same row shifted one byte to the right as its two sources;
 * line_size is used as the stride for the destination and for both sources.
 */
void ff_put_no_rnd_pixels8_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *right = pixels + 1;

    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, right,
                                   line_size, line_size, line_size, h);
}
876
/*
 * 16-wide "x2" no_rnd put, done as two 8-wide passes:
 * ff_put_no_rnd_pixels8_x2_8_mmi is run on byte columns 0-7 and then on
 * byte columns 8-15.
 */
void ff_put_no_rnd_pixels16_x2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        ff_put_no_rnd_pixels8_x2_8_mmi(block + col, pixels + col,
                                       line_size, h);
}
883
/*
 * 4-wide "y2" put: hands ff_put_pixels4_l2_8_mmi the row at pixels and the
 * row one line_size further on as its two sources; line_size is used as the
 * stride for the destination and for both sources.
 */
void ff_put_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_put_pixels4_l2_8_mmi(block, pixels, below,
                            line_size, line_size, line_size, h);
}
890
/*
 * 8-wide "y2" put: hands ff_put_pixels8_l2_8_mmi the row at pixels and the
 * row one line_size further on as its two sources; line_size is used as the
 * stride for the destination and for both sources.
 */
void ff_put_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_put_pixels8_l2_8_mmi(block, pixels, below,
                            line_size, line_size, line_size, h);
}
897
/*
 * 16-wide "y2" put: hands ff_put_pixels16_l2_8_mmi the row at pixels and
 * the row one line_size further on as its two sources; line_size is used as
 * the stride for the destination and for both sources.
 */
void ff_put_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_put_pixels16_l2_8_mmi(block, pixels, below,
                             line_size, line_size, line_size, h);
}
904
/*
 * 4-wide "y2" avg: hands ff_avg_pixels4_l2_8_mmi the row at pixels and the
 * row one line_size further on as its two sources; line_size is used as the
 * stride for the destination and for both sources.
 */
void ff_avg_pixels4_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_avg_pixels4_l2_8_mmi(block, pixels, below,
                            line_size, line_size, line_size, h);
}
911
/*
 * 8-wide "y2" avg: hands ff_avg_pixels8_l2_8_mmi the row at pixels and the
 * row one line_size further on as its two sources; line_size is used as the
 * stride for the destination and for both sources.
 */
void ff_avg_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_avg_pixels8_l2_8_mmi(block, pixels, below,
                            line_size, line_size, line_size, h);
}
918
/*
 * 16-wide "y2" avg, done as two 8-wide passes: ff_avg_pixels8_y2_8_mmi is
 * run on byte columns 0-7 and then on byte columns 8-15.
 */
void ff_avg_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        ff_avg_pixels8_y2_8_mmi(block + col, pixels + col, line_size, h);
}
925
/*
 * 8-wide "y2" no_rnd put: hands ff_put_no_rnd_pixels8_l2_8_mmi the row at
 * pixels and the row one line_size further on as its two sources; line_size
 * is used as the stride for the destination and for both sources.
 */
void ff_put_no_rnd_pixels8_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    const uint8_t *below = pixels + line_size;

    ff_put_no_rnd_pixels8_l2_8_mmi(block, pixels, below,
                                   line_size, line_size, line_size, h);
}
932
/*
 * 16-wide "y2" no_rnd put, done as two 8-wide passes:
 * ff_put_no_rnd_pixels8_y2_8_mmi is run on byte columns 0-7 and then on
 * byte columns 8-15.
 */
void ff_put_no_rnd_pixels16_y2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        ff_put_no_rnd_pixels8_y2_8_mmi(block + col, pixels + col,
                                       line_size, h);
}
939
/*
 * 4-wide "xy2" put in plain C: every output byte is
 * (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + 2) >> 2, computed on four
 * bytes at once inside 32-bit words.
 *
 * Each source byte is split into its two low bits and its six high bits
 * (the latter pre-shifted right by 2) so the per-byte sums cannot carry
 * into the neighbouring byte.  The +2 rounding term is folded into the
 * low-bit sum of every second row, so each row pair carries it exactly once.
 *
 * block      destination, stepped by line_size per row; stored through a
 *            uint32_t pointer (assumes block is suitably aligned for that -
 *            TODO confirm with callers)
 * pixels     source; 5 bytes per row are read (4-byte loads at offsets 0, 1)
 * line_size  byte distance between consecutive rows of both buffers
 * h          row count; two rows are produced per loop pass, so an odd h
 *            results in h + 1 rows being written
 */
void ff_put_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    const uint32_t low_bits  = 0x03030303UL;
    const uint32_t high_bits = 0xFCFCFCFCUL;
    const uint32_t round2    = 0x02020202UL;
    const uint32_t keep_low4 = 0x0F0F0F0FUL;
    uint32_t left, right;
    uint32_t lo_r, hi_r;    /* partial sums of the row carrying the rounding */
    uint32_t lo_n, hi_n;    /* partial sums of the row without rounding      */
    int row = 0;

    /* Prime the pipeline with the first source row. */
    left  = AV_RN32(pixels);
    right = AV_RN32(pixels + 1);
    lo_r  = (left & low_bits) + (right & low_bits) + round2;
    hi_r  = ((left & high_bits) >> 2) + ((right & high_bits) >> 2);
    pixels += line_size;

    while (row < h) {
        /* Next source row, summed without the rounding term. */
        left  = AV_RN32(pixels);
        right = AV_RN32(pixels + 1);
        lo_n  = (left & low_bits) + (right & low_bits);
        hi_n  = ((left & high_bits) >> 2) + ((right & high_bits) >> 2);
        *((uint32_t *) block) = hi_r + hi_n + (((lo_r + lo_n) >> 2) & keep_low4);
        pixels += line_size;
        block  += line_size;

        /* Following source row, summed with the rounding term. */
        left  = AV_RN32(pixels);
        right = AV_RN32(pixels + 1);
        lo_r  = (left & low_bits) + (right & low_bits) + round2;
        hi_r  = ((left & high_bits) >> 2) + ((right & high_bits) >> 2);
        *((uint32_t *) block) = hi_r + hi_n + (((lo_r + lo_n) >> 2) & keep_low4);
        pixels += line_size;
        block  += line_size;

        row += 2;
    }
}
977
978 void ff_put_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
979     ptrdiff_t line_size, int h)
980 {
981 #if 1
982     double ftmp[10];
983     mips_reg addr[2];
984
985     __asm__ volatile (
986         "xor        %[ftmp7],   %[ftmp7],       %[ftmp7]                \n\t"
987         "dli        %[addr0],   0x0f                                    \n\t"
988         "pcmpeqw    %[ftmp6],   %[ftmp6],       %[ftmp6]                \n\t"
989         "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
990         "dli        %[addr0],   0x01                                    \n\t"
991         "psrlh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
992         "dmtc1      %[addr0],   %[ftmp8]                                \n\t"
993         "psllh      %[ftmp6],   %[ftmp6],       %[ftmp8]                \n\t"
994
995         "dli        %[addr0],   0x02                                    \n\t"
996         "gsldlc1    %[ftmp0],   0x07(%[pixels])                         \n\t"
997         "gsldrc1    %[ftmp0],   0x00(%[pixels])                         \n\t"
998         "dmtc1      %[addr0],   %[ftmp9]                                \n\t"
999         "gsldlc1    %[ftmp4],   0x08(%[pixels])                         \n\t"
1000         "gsldrc1    %[ftmp4],   0x01(%[pixels])                         \n\t"
1001         "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
1002         "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
1003         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
1004         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
1005         "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
1006         "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1007         "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1008         "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
1009         "xor        %[addr0],   %[addr0],       %[addr0]                \n\t"
1010         PTR_ADDU   "%[pixels],  %[pixels],      %[line_size]            \n\t"
1011         ".p2align   3                                                   \n\t"
1012         "1:                                                             \n\t"
1013         PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
1014         "gsldlc1    %[ftmp0],   0x07(%[addr1])                          \n\t"
1015         "gsldrc1    %[ftmp0],   0x00(%[addr1])                          \n\t"
1016         "gsldlc1    %[ftmp2],   0x08(%[addr1])                          \n\t"
1017         "gsldrc1    %[ftmp2],   0x01(%[addr1])                          \n\t"
1018         "mov.d      %[ftmp1],   %[ftmp0]                                \n\t"
1019         "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
1020         "punpcklbh  %[ftmp0],   %[ftmp0],       %[ftmp7]                \n\t"
1021         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
1022         "punpckhbh  %[ftmp1],   %[ftmp1],       %[ftmp7]                \n\t"
1023         "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
1024         "paddush    %[ftmp0],   %[ftmp0],       %[ftmp2]                \n\t"
1025         "paddush    %[ftmp1],   %[ftmp1],       %[ftmp3]                \n\t"
1026         "paddush    %[ftmp4],   %[ftmp4],       %[ftmp6]                \n\t"
1027         "paddush    %[ftmp5],   %[ftmp5],       %[ftmp6]                \n\t"
1028         "paddush    %[ftmp4],   %[ftmp4],       %[ftmp0]                \n\t"
1029         "paddush    %[ftmp5],   %[ftmp5],       %[ftmp1]                \n\t"
1030         "psrlh      %[ftmp4],   %[ftmp4],       %[ftmp9]                \n\t"
1031         "psrlh      %[ftmp5],   %[ftmp5],       %[ftmp9]                \n\t"
1032         "packushb   %[ftmp4],   %[ftmp4],       %[ftmp5]                \n\t"
1033         "gssdxc1    %[ftmp4],   0x00(%[block],  %[addr0])               \n\t"
1034         PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
1035         PTR_ADDU   "%[addr1],   %[pixels],      %[addr0]                \n\t"
1036         "gsldlc1    %[ftmp2],   0x07(%[addr1])                          \n\t"
1037         "gsldrc1    %[ftmp2],   0x00(%[addr1])                          \n\t"
1038         "gsldlc1    %[ftmp4],   0x08(%[addr1])                          \n\t"
1039         "gsldrc1    %[ftmp4],   0x01(%[addr1])                          \n\t"
1040         "mov.d      %[ftmp3],   %[ftmp2]                                \n\t"
1041         "mov.d      %[ftmp5],   %[ftmp4]                                \n\t"
1042         "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp7]                \n\t"
1043         "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp7]                \n\t"
1044         "punpckhbh  %[ftmp3],   %[ftmp3],       %[ftmp7]                \n\t"
1045         "punpckhbh  %[ftmp5],   %[ftmp5],       %[ftmp7]                \n\t"
1046         "paddush    %[ftmp4],   %[ftmp4],       %[ftmp2]                \n\t"
1047         "paddush    %[ftmp5],   %[ftmp5],       %[ftmp3]                \n\t"
1048         "paddush    %[ftmp0],   %[ftmp0],       %[ftmp6]                \n\t"
1049         "paddush    %[ftmp1],   %[ftmp1],       %[ftmp6]                \n\t"
1050         "paddush    %[ftmp0],   %[ftmp0],       %[ftmp4]                \n\t"
1051         "paddush    %[ftmp1],   %[ftmp1],       %[ftmp5]                \n\t"
1052         "psrlh      %[ftmp0],   %[ftmp0],       %[ftmp9]                \n\t"
1053         "psrlh      %[ftmp1],   %[ftmp1],       %[ftmp9]                \n\t"
1054         "packushb   %[ftmp0],   %[ftmp0],       %[ftmp1]                \n\t"
1055         "gssdxc1    %[ftmp0],   0x00(%[block],  %[addr0])               \n\t"
1056         PTR_ADDU   "%[addr0],   %[addr0],       %[line_size]            \n\t"
1057         PTR_ADDU   "%[h],       %[h],           -0x02                   \n\t"
1058         "bnez       %[h],       1b                                      \n\t"
1059         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
1060           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
1061           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
1062           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
1063           [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
1064           [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
1065           [h]"+&r"(h),                      [pixels]"+&r"(pixels)
1066         : [block]"r"(block),                [line_size]"r"((mips_reg)line_size)
1067         : "memory"
1068     );
1069 #else
1070     /* FIXME HIGH BIT DEPTH */
1071     int j;
1072
1073     for (j = 0; j < 2; j++) {
1074         int i;
1075         const uint32_t a = AV_RN32(pixels);
1076         const uint32_t b = AV_RN32(pixels + 1);
1077         uint32_t l0 = (a & 0x03030303UL) +
1078                       (b & 0x03030303UL) +
1079                            0x02020202UL;
1080         uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) +
1081                       ((b & 0xFCFCFCFCUL) >> 2);
1082         uint32_t l1, h1;
1083
1084         pixels += line_size;
1085         for (i = 0; i < h; i += 2) {
1086             uint32_t a = AV_RN32(pixels);
1087             uint32_t b = AV_RN32(pixels + 1);
1088             l1 = (a & 0x03030303UL) +
1089                  (b & 0x03030303UL);
1090             h1 = ((a & 0xFCFCFCFCUL) >> 2) +
1091                  ((b & 0xFCFCFCFCUL) >> 2);
1092             *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1093             pixels += line_size;
1094             block  += line_size;
1095             a  = AV_RN32(pixels);
1096             b  = AV_RN32(pixels + 1);
1097             l0 = (a & 0x03030303UL) +
1098                  (b & 0x03030303UL) +
1099                       0x02020202UL;
1100             h0 = ((a & 0xFCFCFCFCUL) >> 2) +
1101                  ((b & 0xFCFCFCFCUL) >> 2);
1102             *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1103             pixels += line_size;
1104             block  += line_size;
1105         }
1106         pixels += 4 - line_size * (h + 1);
1107         block  += 4 - line_size * h;
1108     }
1109 #endif
1110 }
1111
/**
 * 16-byte-wide xy2 "put": handled as two independent 8-byte-wide halves,
 * each delegated to ff_put_pixels8_xy2_8_mmi().
 *
 * @param block     destination; the second half starts at block + 8
 * @param pixels    source; the second half starts at pixels + 8
 * @param line_size stride, forwarded unchanged to the 8-wide routine
 * @param h         row count, forwarded unchanged to the 8-wide routine
 */
void ff_put_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int x_off;

    /* Left half first (offset 0), then the half 8 bytes further right. */
    for (x_off = 0; x_off < 16; x_off += 8)
        ff_put_pixels8_xy2_8_mmi(block + x_off, pixels + x_off, line_size, h);
}
1118
/**
 * 4-byte-wide xy2 "avg".
 *
 * For every output byte the 2x2 source neighbourhood (x, y), (x + 1, y),
 * (x, y + 1), (x + 1, y + 1) is reduced to (sum + 2) >> 2, working on four
 * packed bytes per 32-bit word: the low two bits and the high six bits of
 * each byte are summed separately so that no carry crosses a byte boundary.
 * The interpolated word is then merged with the word already stored in
 * block through rnd_avg32() (defined elsewhere; presumably a per-byte
 * rounding average -- confirm against its definition).
 *
 * @param block     destination, accessed through a uint32_t pointer
 * @param pixels    source; bytes 0..4 of rows 0..h are read
 * @param line_size stride applied to both block and pixels
 * @param h         row count; two rows are produced per loop iteration,
 *                  so an odd h results in h + 1 rows being written
 */
void ff_avg_pixels4_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    uint32_t left, right;   /* four source bytes at x and at x + 1        */
    uint32_t lo_a, hi_a;    /* horizontal row sums including the +2 bias  */
    uint32_t lo_b, hi_b;    /* horizontal row sums without bias           */
    uint32_t *dst;
    int rows_left;

    /* Prime the pipeline with the horizontal sums of the first row. */
    left  = AV_RN32(pixels);
    right = AV_RN32(pixels + 1);
    lo_a  = (left & 0x03030303UL) + (right & 0x03030303UL) + 0x02020202UL;
    hi_a  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
    pixels += line_size;

    for (rows_left = h; rows_left > 0; rows_left -= 2) {
        /* First output row of the pair: biased previous row + unbiased new row. */
        left  = AV_RN32(pixels);
        right = AV_RN32(pixels + 1);
        lo_b  = (left & 0x03030303UL) + (right & 0x03030303UL);
        hi_b  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
        dst   = (uint32_t *) block;
        *dst  = rnd_avg32(*dst,
                          hi_a + hi_b + (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL));
        pixels += line_size;
        block  += line_size;

        /* Second output row: unbiased previous row + freshly biased new row. */
        left  = AV_RN32(pixels);
        right = AV_RN32(pixels + 1);
        lo_a  = (left & 0x03030303UL) + (right & 0x03030303UL) + 0x02020202UL;
        hi_a  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
        dst   = (uint32_t *) block;
        *dst  = rnd_avg32(*dst,
                          hi_a + hi_b + (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL));
        pixels += line_size;
        block  += line_size;
    }
}
1156
/**
 * 8-byte-wide xy2 "avg", processed as two 4-byte-wide columns.
 *
 * Within a column, every output byte is the 2x2 source neighbourhood
 * reduced to (sum + 2) >> 2, computed on four packed bytes per 32-bit word
 * (low two bits and high six bits of each byte are summed separately so
 * carries never cross a byte boundary).  The result is merged with the
 * word already stored in block through rnd_avg32() (defined elsewhere;
 * presumably a per-byte rounding average -- confirm against its
 * definition).
 *
 * @param block     destination, accessed through a uint32_t pointer
 * @param pixels    source; bytes 0..8 of rows 0..h are read
 * @param line_size stride applied to both block and pixels
 * @param h         row count; two rows are produced per inner iteration,
 *                  so h is expected to be even
 */
void ff_avg_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    /* Applied after each column: for even h these take both pointers back
     * to their first row, shifted 4 bytes to the right. */
    const ptrdiff_t src_rewind = 4 - line_size * (h + 1);
    const ptrdiff_t dst_rewind = 4 - line_size * h;
    uint32_t left, right;   /* four source bytes at x and at x + 1        */
    uint32_t lo_a, hi_a;    /* horizontal row sums including the +2 bias  */
    uint32_t lo_b, hi_b;    /* horizontal row sums without bias           */
    uint32_t *dst;
    int col, rows_left;

    for (col = 0; col < 2; col++) {
        /* Horizontal sums of the column's first source row. */
        left  = AV_RN32(pixels);
        right = AV_RN32(pixels + 1);
        lo_a  = (left & 0x03030303UL) + (right & 0x03030303UL) + 0x02020202UL;
        hi_a  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
        pixels += line_size;

        for (rows_left = h; rows_left > 0; rows_left -= 2) {
            /* First output row of the pair. */
            left  = AV_RN32(pixels);
            right = AV_RN32(pixels + 1);
            lo_b  = (left & 0x03030303UL) + (right & 0x03030303UL);
            hi_b  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
            dst   = (uint32_t *) block;
            *dst  = rnd_avg32(*dst,
                              hi_a + hi_b + (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block  += line_size;

            /* Second output row of the pair. */
            left  = AV_RN32(pixels);
            right = AV_RN32(pixels + 1);
            lo_a  = (left & 0x03030303UL) + (right & 0x03030303UL) + 0x02020202UL;
            hi_a  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
            dst   = (uint32_t *) block;
            *dst  = rnd_avg32(*dst,
                              hi_a + hi_b + (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block  += line_size;
        }

        pixels += src_rewind;
        block  += dst_rewind;
    }
}
1200
/**
 * 16-byte-wide xy2 "avg": handled as two independent 8-byte-wide halves,
 * each delegated to ff_avg_pixels8_xy2_8_mmi().
 *
 * @param block     destination; the second half starts at block + 8
 * @param pixels    source; the second half starts at pixels + 8
 * @param line_size stride, forwarded unchanged to the 8-wide routine
 * @param h         row count, forwarded unchanged to the 8-wide routine
 */
void ff_avg_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int x_off;

    /* Left half first (offset 0), then the half 8 bytes further right. */
    for (x_off = 0; x_off < 16; x_off += 8)
        ff_avg_pixels8_xy2_8_mmi(block + x_off, pixels + x_off, line_size, h);
}
1207
/**
 * 8-byte-wide xy2 "put" with the smaller rounding bias, processed as two
 * 4-byte-wide columns.
 *
 * Within a column, every output byte is the 2x2 source neighbourhood
 * (x, y), (x + 1, y), (x, y + 1), (x + 1, y + 1) reduced to
 * (sum + 1) >> 2.  Four bytes are handled per 32-bit word: the low two
 * bits and the high six bits of each byte are summed separately so that
 * carries never cross a byte boundary.  The result overwrites block.
 *
 * @param block     destination, written through a uint32_t pointer
 * @param pixels    source; bytes 0..8 of rows 0..h are read
 * @param line_size stride applied to both block and pixels
 * @param h         row count; two rows are produced per inner iteration,
 *                  so h is expected to be even
 */
void ff_put_no_rnd_pixels8_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    /* FIXME HIGH BIT DEPTH */
    /* Applied after each column: for even h these take both pointers back
     * to their first row, shifted 4 bytes to the right. */
    const ptrdiff_t src_rewind = 4 - line_size * (h + 1);
    const ptrdiff_t dst_rewind = 4 - line_size * h;
    uint32_t left, right;   /* four source bytes at x and at x + 1        */
    uint32_t lo_a, hi_a;    /* horizontal row sums including the +1 bias  */
    uint32_t lo_b, hi_b;    /* horizontal row sums without bias           */
    int col, rows_left;

    for (col = 0; col < 2; col++) {
        /* Horizontal sums of the column's first source row. */
        left  = AV_RN32(pixels);
        right = AV_RN32(pixels + 1);
        lo_a  = (left & 0x03030303UL) + (right & 0x03030303UL) + 0x01010101UL;
        hi_a  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
        pixels += line_size;

        for (rows_left = h; rows_left > 0; rows_left -= 2) {
            /* First output row of the pair. */
            left  = AV_RN32(pixels);
            right = AV_RN32(pixels + 1);
            lo_b  = (left & 0x03030303UL) + (right & 0x03030303UL);
            hi_b  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                hi_a + hi_b + (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;

            /* Second output row of the pair. */
            left  = AV_RN32(pixels);
            right = AV_RN32(pixels + 1);
            lo_a  = (left & 0x03030303UL) + (right & 0x03030303UL) + 0x01010101UL;
            hi_a  = ((left & 0xFCFCFCFCUL) >> 2) + ((right & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                hi_a + hi_b + (((lo_a + lo_b) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block  += line_size;
        }

        pixels += src_rewind;
        block  += dst_rewind;
    }
}
1251
/**
 * 16-byte-wide xy2 "put" without the full rounding bias: handled as two
 * independent 8-byte-wide halves, each delegated to
 * ff_put_no_rnd_pixels8_xy2_8_mmi().
 *
 * @param block     destination; the second half starts at block + 8
 * @param pixels    source; the second half starts at pixels + 8
 * @param line_size stride, forwarded unchanged to the 8-wide routine
 * @param h         row count, forwarded unchanged to the 8-wide routine
 */
void ff_put_no_rnd_pixels16_xy2_8_mmi(uint8_t *block, const uint8_t *pixels,
    ptrdiff_t line_size, int h)
{
    int x_off;

    /* Left half first (offset 0), then the half 8 bytes further right. */
    for (x_off = 0; x_off < 16; x_off += 8)
        ff_put_no_rnd_pixels8_xy2_8_mmi(block + x_off, pixels + x_off,
                                        line_size, h);
}