]> git.sesse.net Git - ffmpeg/blob - libavcodec/i386/vp3dsp_mmx.c
correct MMX-optimized variant of VP3 IDCT, with comments (thank you
[ffmpeg] / libavcodec / i386 / vp3dsp_mmx.c
1 /*
2  * Copyright (C) 2004 the ffmpeg project
3  *
4  * This library is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2 of the License, or (at your option) any later version.
8  *
9  * This library is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with this library; if not, write to the Free Software
16  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17  */
18
19 /**
20  * @file vp3dsp_mmx.c
21  * MMX-optimized functions cribbed from the original VP3 source code.
22  */
23
24 #include "../dsputil.h"
25 #include "mmx.h"
26
27 #define IdctAdjustBeforeShift 8
28
29 /* (12 * 4) 2-byte memory locations ( = 96 bytes total)
30  * idct_constants[0..15] = Mask table (M(I))
31  * idct_constants[16..43] = Cosine table (C(I))
32  * idct_constants[44..47] = 8
33  */
34 static uint16_t idct_constants[(4 + 7 + 1) * 4];
35 static uint16_t idct_cosine_table[7] = {
36     64277, 60547, 54491, 46341, 36410, 25080, 12785
37 };
38
39 #define r0 mm0
40 #define r1 mm1
41 #define r2 mm2
42 #define r3 mm3
43 #define r4 mm4
44 #define r5 mm5
45 #define r6 mm6
46 #define r7 mm7
47
48 /* from original comments: The Macro does IDct on 4 1-D Dcts */
49 #define BeginIDCT() { \
50     movq_m2r(*I(3), r2); \
51     movq_m2r(*C(3), r6); \
52     movq_r2r(r2, r4); \
53     movq_m2r(*J(5), r7); \
54     pmulhw_r2r(r6, r4);       /* r4 = c3*i3 - i3 */ \
55     movq_m2r(*C(5), r1); \
56     pmulhw_r2r(r7, r6);       /* r6 = c3*i5 - i5 */ \
57     movq_r2r(r1, r5); \
58     pmulhw_r2r(r2, r1);       /* r1 = c5*i3 - i3 */ \
59     movq_m2r(*I(1), r3); \
60     pmulhw_r2r(r7, r5);       /* r5 = c5*i5 - i5 */ \
61     movq_m2r(*C(1), r0);      /* (all registers are in use) */ \
62     paddw_r2r(r2, r4);        /* r4 = c3*i3 */ \
63     paddw_r2r(r7, r6);        /* r6 = c3*i5 */ \
64     paddw_r2r(r1, r2);        /* r2 = c5*i3 */ \
65     movq_m2r(*J(7), r1); \
66     paddw_r2r(r5, r7);        /* r7 = c5*i5 */ \
67     movq_r2r(r0, r5);         /* r5 = c1 */ \
68     pmulhw_r2r(r3, r0);       /* r0 = c1*i1 - i1 */ \
69     paddsw_r2r(r7, r4);       /* r4 = C = c3*i3 + c5*i5 */ \
70     pmulhw_r2r(r1, r5);       /* r5 = c1*i7 - i7 */ \
71     movq_m2r(*C(7), r7); \
72     psubsw_r2r(r2, r6);       /* r6 = D = c3*i5 - c5*i3 */ \
73     paddw_r2r(r3, r0);        /* r0 = c1*i1 */ \
74     pmulhw_r2r(r7, r3);       /* r3 = c7*i1 */ \
75     movq_m2r(*I(2), r2); \
76     pmulhw_r2r(r1, r7);       /* r7 = c7*i7 */ \
77     paddw_r2r(r1, r5);        /* r5 = c1*i7 */ \
78     movq_r2r(r2, r1);         /* r1 = i2 */ \
79     pmulhw_m2r(*C(2), r2);    /* r2 = c2*i2 - i2 */ \
80     psubsw_r2r(r5, r3);       /* r3 = B = c7*i1 - c1*i7 */ \
81     movq_m2r(*J(6), r5); \
82     paddsw_r2r(r7, r0);       /* r0 = A = c1*i1 + c7*i7 */ \
83     movq_r2r(r5, r7);         /* r7 = i6 */ \
84     psubsw_r2r(r4, r0);       /* r0 = A - C */ \
85     pmulhw_m2r(*C(2), r5);    /* r5 = c2*i6 - i6 */ \
86     paddw_r2r(r1, r2);        /* r2 = c2*i2 */ \
87     pmulhw_m2r(*C(6), r1);    /* r1 = c6*i2 */ \
88     paddsw_r2r(r4, r4);       /* r4 = C + C */ \
89     paddsw_r2r(r0, r4);       /* r4 = C. = A + C */ \
90     psubsw_r2r(r6, r3);       /* r3 = B - D */ \
91     paddw_r2r(r7, r5);        /* r5 = c2*i6 */ \
92     paddsw_r2r(r6, r6);       /* r6 = D + D */ \
93     pmulhw_m2r(*C(6), r7);    /* r7 = c6*i6 */ \
94     paddsw_r2r(r3, r6);       /* r6 = D. = B + D */ \
95     movq_r2m(r4, *I(1));      /* save C. at I(1) */ \
96     psubsw_r2r(r5, r1);       /* r1 = H = c6*i2 - c2*i6 */ \
97     movq_m2r(*C(4), r4); \
98     movq_r2r(r3, r5);         /* r5 = B - D */ \
99     pmulhw_r2r(r4, r3);       /* r3 = (c4 - 1) * (B - D) */ \
100     paddsw_r2r(r2, r7);       /* r7 = G = c6*i6 + c2*i2 */ \
101     movq_r2m(r6, *I(2));      /* save D. at I(2) */ \
102     movq_r2r(r0, r2);         /* r2 = A - C */ \
103     movq_m2r(*I(0), r6); \
104     pmulhw_r2r(r4, r0);       /* r0 = (c4 - 1) * (A - C) */ \
105     paddw_r2r(r3, r5);        /* r5 = B. = c4 * (B - D) */ \
106     movq_m2r(*J(4), r3); \
107     psubsw_r2r(r1, r5);       /* r5 = B.. = B. - H */ \
108     paddw_r2r(r0, r2);        /* r0 = A. = c4 * (A - C) */ \
109     psubsw_r2r(r3, r6);       /* r6 = i0 - i4 */ \
110     movq_r2r(r6, r0); \
111     pmulhw_r2r(r4, r6);       /* r6 = (c4 - 1) * (i0 - i4) */ \
112     paddsw_r2r(r3, r3);       /* r3 = i4 + i4 */ \
113     paddsw_r2r(r1, r1);       /* r1 = H + H */ \
114     paddsw_r2r(r0, r3);       /* r3 = i0 + i4 */ \
115     paddsw_r2r(r5, r1);       /* r1 = H. = B + H */ \
116     pmulhw_r2r(r3, r4);       /* r4 = (c4 - 1) * (i0 + i4) */ \
117     paddsw_r2r(r0, r6);       /* r6 = F = c4 * (i0 - i4) */ \
118     psubsw_r2r(r2, r6);       /* r6 = F. = F - A. */ \
119     paddsw_r2r(r2, r2);       /* r2 = A. + A. */ \
120     movq_m2r(*I(1), r0);      /* r0 = C. */ \
121     paddsw_r2r(r6, r2);       /* r2 = A.. = F + A. */ \
122     paddw_r2r(r3, r4);        /* r4 = E = c4 * (i0 + i4) */ \
123     psubsw_r2r(r1, r2);       /* r2 = R2 = A.. - H. */ \
124 }
125
126 /* RowIDCT gets ready to transpose */
127 #define RowIDCT() { \
128     \
129     BeginIDCT(); \
130     \
131     movq_m2r(*I(2), r3);   /* r3 = D. */ \
132     psubsw_r2r(r7, r4);    /* r4 = E. = E - G */ \
133     paddsw_r2r(r1, r1);    /* r1 = H. + H. */ \
134     paddsw_r2r(r7, r7);    /* r7 = G + G */ \
135     paddsw_r2r(r2, r1);    /* r1 = R1 = A.. + H. */ \
136     paddsw_r2r(r4, r7);    /* r7 = G. = E + G */ \
137     psubsw_r2r(r3, r4);    /* r4 = R4 = E. - D. */ \
138     paddsw_r2r(r3, r3); \
139     psubsw_r2r(r5, r6);    /* r6 = R6 = F. - B.. */ \
140     paddsw_r2r(r5, r5); \
141     paddsw_r2r(r4, r3);    /* r3 = R3 = E. + D. */ \
142     paddsw_r2r(r6, r5);    /* r5 = R5 = F. + B.. */ \
143     psubsw_r2r(r0, r7);    /* r7 = R7 = G. - C. */ \
144     paddsw_r2r(r0, r0); \
145     movq_r2m(r1, *I(1));   /* save R1 */ \
146     paddsw_r2r(r7, r0);    /* r0 = R0 = G. + C. */ \
147 }
148
149 /* Column IDCT normalizes and stores final results */
150 #define ColumnIDCT() { \
151     \
152     BeginIDCT(); \
153     \
154     paddsw_m2r(*Eight, r2);    /* adjust R2 (and R1) for shift */ \
155     paddsw_r2r(r1, r1);        /* r1 = H. + H. */ \
156     paddsw_r2r(r2, r1);        /* r1 = R1 = A.. + H. */ \
157     psraw_i2r(4, r2);          /* r2 = NR2 */ \
158     psubsw_r2r(r7, r4);        /* r4 = E. = E - G */ \
159     psraw_i2r(4, r1);          /* r1 = NR1 */ \
160     movq_m2r(*I(2), r3);       /* r3 = D. */ \
161     paddsw_r2r(r7, r7);        /* r7 = G + G */ \
162     movq_r2m(r2, *I(2));       /* store NR2 at I2 */ \
163     paddsw_r2r(r4, r7);        /* r7 = G. = E + G */ \
164     movq_r2m(r1, *I(1));       /* store NR1 at I1 */ \
165     psubsw_r2r(r3, r4);        /* r4 = R4 = E. - D. */ \
166     paddsw_m2r(*Eight, r4);    /* adjust R4 (and R3) for shift */ \
167     paddsw_r2r(r3, r3);        /* r3 = D. + D. */ \
168     paddsw_r2r(r4, r3);        /* r3 = R3 = E. + D. */ \
169     psraw_i2r(4, r4);          /* r4 = NR4 */ \
170     psubsw_r2r(r5, r6);        /* r6 = R6 = F. - B.. */ \
171     psraw_i2r(4, r3);          /* r3 = NR3 */ \
172     paddsw_m2r(*Eight, r6);    /* adjust R6 (and R5) for shift */ \
173     paddsw_r2r(r5, r5);        /* r5 = B.. + B.. */ \
174     paddsw_r2r(r6, r5);        /* r5 = R5 = F. + B.. */ \
175     psraw_i2r(4, r6);          /* r6 = NR6 */ \
176     movq_r2m(r4, *J(4));       /* store NR4 at J4 */ \
177     psraw_i2r(4, r5);          /* r5 = NR5 */ \
178     movq_r2m(r3, *I(3));       /* store NR3 at I3 */ \
179     psubsw_r2r(r0, r7);        /* r7 = R7 = G. - C. */ \
180     paddsw_m2r(*Eight, r7);    /* adjust R7 (and R0) for shift */ \
181     paddsw_r2r(r0, r0);        /* r0 = C. + C. */ \
182     paddsw_r2r(r7, r0);        /* r0 = R0 = G. + C. */ \
183     psraw_i2r(4, r7);          /* r7 = NR7 */ \
184     movq_r2m(r6, *J(6));       /* store NR6 at J6 */ \
185     psraw_i2r(4, r0);          /* r0 = NR0 */ \
186     movq_r2m(r5, *J(5));       /* store NR5 at J5 */ \
187     movq_r2m(r7, *J(7));       /* store NR7 at J7 */ \
188     movq_r2m(r0, *I(0));       /* store NR0 at I0 */ \
189 }
190
191 /* Following macro does two 4x4 transposes in place.
192
193   At entry (we assume):
194
195     r0 = a3 a2 a1 a0
196     I(1) = b3 b2 b1 b0
197     r2 = c3 c2 c1 c0
198     r3 = d3 d2 d1 d0
199
200     r4 = e3 e2 e1 e0
201     r5 = f3 f2 f1 f0
202     r6 = g3 g2 g1 g0
203     r7 = h3 h2 h1 h0
204
205   At exit, we have:
206
207     I(0) = d0 c0 b0 a0
208     I(1) = d1 c1 b1 a1
209     I(2) = d2 c2 b2 a2
210     I(3) = d3 c3 b3 a3
211     
212     J(4) = h0 g0 f0 e0
213     J(5) = h1 g1 f1 e1
214     J(6) = h2 g2 f2 e2
215     J(7) = h3 g3 f3 e3
216
217    I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
218    J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
219
220    Since r1 is free at entry, we calculate the Js first. */
221
222 #define Transpose() { \
223     movq_r2r(r4, r1);         /* r1 = e3 e2 e1 e0 */ \
224     punpcklwd_r2r(r5, r4);    /* r4 = f1 e1 f0 e0 */ \
225     movq_r2m(r0, *I(0));      /* save a3 a2 a1 a0 */ \
226     punpckhwd_r2r(r5, r1);    /* r1 = f3 e3 f2 e2 */ \
227     movq_r2r(r6, r0);         /* r0 = g3 g2 g1 g0 */ \
228     punpcklwd_r2r(r7, r6);    /* r6 = h1 g1 h0 g0 */ \
229     movq_r2r(r4, r5);         /* r5 = f1 e1 f0 e0 */ \
230     punpckldq_r2r(r6, r4);    /* r4 = h0 g0 f0 e0 = R4 */ \
231     punpckhdq_r2r(r6, r5);    /* r5 = h1 g1 f1 e1 = R5 */ \
232     movq_r2r(r1, r6);         /* r6 = f3 e3 f2 e2 */ \
233     movq_r2m(r4, *J(4)); \
234     punpckhwd_r2r(r7, r0);    /* r0 = h3 g3 h2 g2 */ \
235     movq_r2m(r5, *J(5)); \
236     punpckhdq_r2r(r0, r6);    /* r6 = h3 g3 f3 e3 = R7 */ \
237     movq_m2r(*I(0), r4);      /* r4 = a3 a2 a1 a0 */ \
238     punpckldq_r2r(r0, r1);    /* r1 = h2 g2 f2 e2 = R6 */ \
239     movq_m2r(*I(1), r5);      /* r5 = b3 b2 b1 b0 */ \
240     movq_r2r(r4, r0);         /* r0 = a3 a2 a1 a0 */ \
241     movq_r2m(r6, *J(7)); \
242     punpcklwd_r2r(r5, r0);    /* r0 = b1 a1 b0 a0 */ \
243     movq_r2m(r1, *J(6)); \
244     punpckhwd_r2r(r5, r4);    /* r4 = b3 a3 b2 a2 */ \
245     movq_r2r(r2, r5);         /* r5 = c3 c2 c1 c0 */ \
246     punpcklwd_r2r(r3, r2);    /* r2 = d1 c1 d0 c0 */ \
247     movq_r2r(r0, r1);         /* r1 = b1 a1 b0 a0 */ \
248     punpckldq_r2r(r2, r0);    /* r0 = d0 c0 b0 a0 = R0 */ \
249     punpckhdq_r2r(r2, r1);    /* r1 = d1 c1 b1 a1 = R1 */ \
250     movq_r2r(r4, r2);         /* r2 = b3 a3 b2 a2 */ \
251     movq_r2m(r0, *I(0)); \
252     punpckhwd_r2r(r3, r5);    /* r5 = d3 c3 d2 c2 */ \
253     movq_r2m(r1, *I(1)); \
254     punpckhdq_r2r(r5, r4);    /* r4 = d3 c3 b3 a3 = R3 */ \
255     punpckldq_r2r(r5, r2);    /* r2 = d2 c2 b2 a2 = R2 */ \
256     movq_r2m(r4, *I(3)); \
257     movq_r2m(r2, *I(2)); \
258 }
259
260 void vp3_dsp_init_mmx(void)
261 {
262     int j = 16;
263     uint16_t *p;
264
265     do {
266         idct_constants[--j] = 0;
267     } while (j);
268
269     idct_constants[0]  = idct_constants[5] =
270     idct_constants[10] = idct_constants[15] = 65535;
271
272     j = 1;
273     do {
274         p = idct_constants + ((j + 3) << 2);
275         p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1];
276     } while (++j <= 7);
277
278     idct_constants[44] = idct_constants[45] =
279     idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
280 }
281
282 static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
283      int16_t *output_data)
284 {
285     /* eax = quantized input
286      * ebx = dequantizer matrix
287      * ecx = IDCT constants
288      *  M(I) = ecx + MaskOffset(0) + I * 8
289      *  C(I) = ecx + CosineOffset(32) + (I-1) * 8
290      * edx = output
291      * r0..r7 = mm0..mm7
292      */
293
294 #define M(x) (idct_constants + x * 4)
295 #define C(x) (idct_constants + 16 + (x - 1) * 4)
296 #define Eight (idct_constants + 44)
297
298     unsigned char *input_bytes = (unsigned char *)input_data;
299     unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix;
300     unsigned char *output_data_bytes = (unsigned char *)output_data;
301
302     movq_m2r(*(input_bytes), r0);
303     pmullw_m2r(*(dequant_matrix_bytes), r0);       /* r0 = 03 02 01 00 */
304     movq_m2r(*(input_bytes+16), r1);
305     pmullw_m2r(*(dequant_matrix_bytes+16), r1);    /* r1 = 13 12 11 10 */
306     movq_m2r(*M(0), r2);                           /* r2 = __ __ __ FF */
307     movq_r2r(r0, r3);                              /* r3 = 03 02 01 00 */
308     movq_m2r(*(input_bytes+8), r4);
309     psrlq_i2r(16, r0);                             /* r0 = __ 03 02 01 */
310     pmullw_m2r(*(dequant_matrix_bytes+8), r4);     /* r4 = 07 06 05 04 */
311     pand_r2r(r2, r3);                              /* r3 = __ __ __ 00 */
312     movq_r2r(r0, r5);                              /* r5 = __ 03 02 01 */
313     movq_r2r(r1, r6);                              /* r6 = 13 12 11 10 */
314     pand_r2r(r2, r5);                              /* r5 = __ __ __ 01 */
315     psllq_i2r(32, r6);                             /* r6 = 11 10 __ __ */
316     movq_m2r(*M(3), r7);                           /* r7 = FF __ __ __ */
317     pxor_r2r(r5, r0);                              /* r0 = __ 03 02 __ */
318     pand_r2r(r6, r7);                              /* r7 = 11 __ __ __ */
319     por_r2r(r3, r0);                               /* r0 = __ 03 02 00 */
320     pxor_r2r(r7, r6);                              /* r6 = __ 10 __ __ */
321     por_r2r(r7, r0);                               /* r0 = 11 03 02 00 = R0 */
322     movq_m2r(*M(3), r7);                           /* r7 = FF __ __ __ */
323     movq_r2r(r4, r3);                              /* r3 = 07 06 05 04 */
324     movq_r2m(r0, *(output_data_bytes));            /* write R0 = r0 */
325     pand_r2r(r2, r3);                              /* r3 = __ __ __ 04 */
326     movq_m2r(*(input_bytes+32), r0);
327     psllq_i2r(16, r3);                             /* r3 = __ __ 04 __ */
328     pmullw_m2r(*(dequant_matrix_bytes+32), r0);    /* r0 = 23 22 21 20 */
329     pand_r2r(r1, r7);                              /* r7 = 13 __ __ __ */
330     por_r2r(r3, r5);                               /* r5 = __ __ 04 01 */
331     por_r2r(r6, r7);                               /* r7 = 13 10 __ __ */
332     movq_m2r(*(input_bytes+24), r3);
333     por_r2r(r5, r7);                               /* r7 = 13 10 04 01 = R1 */
334     pmullw_m2r(*(dequant_matrix_bytes+24), r3);    /* r3 = 17 16 15 14 */
335     psrlq_i2r(16, r4);                             /* r4 = __ 07 06 05 */
336     movq_r2m(r7, *(output_data_bytes+16));         /* write R1 = r7 */
337     movq_r2r(r4, r5);                              /* r5 = __ 07 06 05 */
338     movq_r2r(r0, r7);                              /* r7 = 23 22 21 20 */
339     psrlq_i2r(16, r4);                             /* r4 = __ __ 07 06 */
340     psrlq_i2r(48, r7);                             /* r7 = __ __ __ 23 */
341     movq_r2r(r2, r6);                              /* r6 = __ __ __ FF */
342     pand_r2r(r2, r5);                              /* r5 = __ __ __ 05 */
343     pand_r2r(r4, r6);                              /* r6 = __ __ __ 06 */
344     movq_r2m(r7, *(output_data_bytes+80));      /* partial R9 = __ __ __ 23 */
345     pxor_r2r(r6, r4);                              /* r4 = __ __ 07 __ */
346     psrlq_i2r(32, r1);                             /* r1 = __ __ 13 12 */
347     por_r2r(r5, r4);                               /* r4 = __ __ 07 05 */
348     movq_m2r(*M(3), r7);                           /* r7 = FF __ __ __ */
349     pand_r2r(r2, r1);                              /* r1 = __ __ __ 12 */
350     movq_m2r(*(input_bytes+48), r5);
351     psllq_i2r(16, r0);                             /* r0 = 22 21 20 __ */
352     pmullw_m2r(*(dequant_matrix_bytes+48), r5);    /* r5 = 33 32 31 30 */
353     pand_r2r(r0, r7);                              /* r7 = 22 __ __ __ */
354     movq_r2m(r1, *(output_data_bytes+64));      /* partial R8 = __ __ __ 12 */
355     por_r2r(r4, r7);                               /* r7 = 22 __ 07 05 */
356     movq_r2r(r3, r4);                              /* r4 = 17 16 15 14 */
357     pand_r2r(r2, r3);                              /* r3 = __ __ __ 14 */
358     movq_m2r(*M(2), r1);                           /* r1 = __ FF __ __ */
359     psllq_i2r(32, r3);                             /* r3 = __ 14 __ __ */
360     por_r2r(r3, r7);                               /* r7 = 22 14 07 05 = R2 */
361     movq_r2r(r5, r3);                              /* r3 = 33 32 31 30 */
362     psllq_i2r(48, r3);                             /* r3 = 30 __ __ __ */
363     pand_r2r(r0, r1);                              /* r1 = __ 21 __ __ */
364     movq_r2m(r7, *(output_data_bytes+32));         /* write R2 = r7 */
365     por_r2r(r3, r6);                               /* r6 = 30 __ __ 06 */
366     movq_m2r(*M(1), r7);                           /* r7 = __ __ FF __ */
367     por_r2r(r1, r6);                               /* r6 = 30 21 __ 06 */
368     movq_m2r(*(input_bytes+56), r1);
369     pand_r2r(r4, r7);                              /* r7 = __ __ 15 __ */
370     pmullw_m2r(*(dequant_matrix_bytes+56), r1);    /* r1 = 37 36 35 34 */
371     por_r2r(r6, r7);                               /* r7 = 30 21 15 06 = R3 */
372     pand_m2r(*M(1), r0);                           /* r0 = __ __ 20 __ */
373     psrlq_i2r(32, r4);                             /* r4 = __ __ 17 16 */
374     movq_r2m(r7, *(output_data_bytes+48));         /* write R3 = r7 */
375     movq_r2r(r4, r6);                              /* r6 = __ __ 17 16 */
376     movq_m2r(*M(3), r7);                           /* r7 = FF __ __ __ */
377     pand_r2r(r2, r4);                              /* r4 = __ __ __ 16 */
378     movq_m2r(*M(1), r3);                           /* r3 = __ __ FF __ */
379     pand_r2r(r1, r7);                              /* r7 = 37 __ __ __ */
380     pand_r2r(r5, r3);                              /* r3 = __ __ 31 __ */
381     por_r2r(r4, r0);                               /* r0 = __ __ 20 16 */
382     psllq_i2r(16, r3);                             /* r3 = __ 31 __ __ */
383     por_r2r(r0, r7);                               /* r7 = 37 __ 20 16 */
384     movq_m2r(*M(2), r4);                           /* r4 = __ FF __ __ */
385     por_r2r(r3, r7);                               /* r7 = 37 31 20 16 = R4 */
386     movq_m2r(*(input_bytes+80), r0);
387     movq_r2r(r4, r3);                              /* r3 = __ __ FF __ */
388     pmullw_m2r(*(dequant_matrix_bytes+80), r0);    /* r0 = 53 52 51 50 */
389     pand_r2r(r5, r4);                              /* r4 = __ 32 __ __ */
390     movq_r2m(r7, *(output_data_bytes+8));          /* write R4 = r7 */
391     por_r2r(r4, r6);                               /* r6 = __ 32 17 16 */
392     movq_r2r(r3, r4);                              /* r4 = __ FF __ __ */
393     psrlq_i2r(16, r6);                             /* r6 = __ __ 32 17 */
394     movq_r2r(r0, r7);                              /* r7 = 53 52 51 50 */
395     pand_r2r(r1, r4);                              /* r4 = __ 36 __ __ */
396     psllq_i2r(48, r7);                             /* r7 = 50 __ __ __ */
397     por_r2r(r4, r6);                               /* r6 = __ 36 32 17 */
398     movq_m2r(*(input_bytes+88), r4);
399     por_r2r(r6, r7);                               /* r7 = 50 36 32 17 = R5 */
400     pmullw_m2r(*(dequant_matrix_bytes+88), r4);    /* r4 = 57 56 55 54 */
401     psrlq_i2r(16, r3);                             /* r3 = __ __ FF __ */
402     movq_r2m(r7, *(output_data_bytes+24));         /* write R5 = r7 */
403     pand_r2r(r1, r3);                              /* r3 = __ __ 35 __ */
404     psrlq_i2r(48, r5);                             /* r5 = __ __ __ 33 */
405     pand_r2r(r2, r1);                              /* r1 = __ __ __ 34 */
406     movq_m2r(*(input_bytes+104), r6);
407     por_r2r(r3, r5);                               /* r5 = __ __ 35 33 */
408     pmullw_m2r(*(dequant_matrix_bytes+104), r6);   /* r6 = 67 66 65 64 */
409     psrlq_i2r(16, r0);                             /* r0 = __ 53 52 51 */
410     movq_r2r(r4, r7);                              /* r7 = 57 56 55 54 */
411     movq_r2r(r2, r3);                              /* r3 = __ __ __ FF */
412     psllq_i2r(48, r7);                             /* r7 = 54 __ __ __ */
413     pand_r2r(r0, r3);                              /* r3 = __ __ __ 51 */
414     pxor_r2r(r3, r0);                              /* r0 = __ 53 52 __ */
415     psllq_i2r(32, r3);                             /* r3 = __ 51 __ __ */
416     por_r2r(r5, r7);                               /* r7 = 54 __ 35 33 */
417     movq_r2r(r6, r5);                              /* r5 = 67 66 65 64 */
418     pand_m2r(*M(1), r6);                           /* r6 = __ __ 65 __ */
419     por_r2r(r3, r7);                               /* r7 = 54 51 35 33 = R6 */
420     psllq_i2r(32, r6);                             /* r6 = 65 __ __ __ */
421     por_r2r(r1, r0);                               /* r0 = __ 53 52 34 */
422     movq_r2m(r7, *(output_data_bytes+40));         /* write R6 = r7 */
423     por_r2r(r6, r0);                               /* r0 = 65 53 52 34 = R7 */
424     movq_m2r(*(input_bytes+120), r7);
425     movq_r2r(r5, r6);                              /* r6 = 67 66 65 64 */
426     pmullw_m2r(*(dequant_matrix_bytes+120), r7);   /* r7 = 77 76 75 74 */
427     psrlq_i2r(32, r5);                             /* r5 = __ __ 67 66 */
428     pand_r2r(r2, r6);                              /* r6 = __ __ __ 64 */
429     movq_r2r(r5, r1);                              /* r1 = __ __ 67 66 */
430     movq_r2m(r0, *(output_data_bytes+56));         /* write R7 = r0 */
431     pand_r2r(r2, r1);                              /* r1 = __ __ __ 66 */
432     movq_m2r(*(input_bytes+112), r0);
433     movq_r2r(r7, r3);                              /* r3 = 77 76 75 74 */
434     pmullw_m2r(*(dequant_matrix_bytes+112), r0);   /* r0 = 73 72 71 70 */
435     psllq_i2r(16, r3);                             /* r3 = 76 75 74 __ */
436     pand_m2r(*M(3), r7);                           /* r7 = 77 __ __ __ */
437     pxor_r2r(r1, r5);                              /* r5 = __ __ 67 __ */
438     por_r2r(r5, r6);                               /* r6 = __ __ 67 64 */
439     movq_r2r(r3, r5);                              /* r5 = 76 75 74 __ */
440     pand_m2r(*M(3), r5);                           /* r5 = 76 __ __ __ */
441     por_r2r(r1, r7);                               /* r7 = 77 __ __ 66 */
442     movq_m2r(*(input_bytes+96), r1);
443     pxor_r2r(r5, r3);                              /* r3 = __ 75 74 __ */
444     pmullw_m2r(*(dequant_matrix_bytes+96), r1);    /* r1 = 63 62 61 60 */
445     por_r2r(r3, r7);                               /* r7 = 77 75 74 66 = R15 */
446     por_r2r(r5, r6);                               /* r6 = 76 __ 67 64 */
447     movq_r2r(r0, r5);                              /* r5 = 73 72 71 70 */
448     movq_r2m(r7, *(output_data_bytes+120));        /* store R15 = r7 */
449     psrlq_i2r(16, r5);                             /* r5 = __ 73 72 71 */
450     pand_m2r(*M(2), r5);                           /* r5 = __ 73 __ __ */
451     movq_r2r(r0, r7);                              /* r7 = 73 72 71 70 */
452     por_r2r(r5, r6);                               /* r6 = 76 73 67 64 = R14 */
453     pand_r2r(r2, r0);                              /* r0 = __ __ __ 70 */
454     pxor_r2r(r0, r7);                              /* r7 = 73 72 71 __ */
455     psllq_i2r(32, r0);                             /* r0 = __ 70 __ __ */
456     movq_r2m(r6, *(output_data_bytes+104));        /* write R14 = r6 */
457     psrlq_i2r(16, r4);                             /* r4 = __ 57 56 55 */
458     movq_m2r(*(input_bytes+72), r5);
459     psllq_i2r(16, r7);                             /* r7 = 72 71 __ __ */
460     pmullw_m2r(*(dequant_matrix_bytes+72), r5);    /* r5 = 47 46 45 44 */
461     movq_r2r(r7, r6);                              /* r6 = 72 71 __ __ */
462     movq_m2r(*M(2), r3);                           /* r3 = __ FF __ __ */
463     psllq_i2r(16, r6);                             /* r6 = 71 __ __ __ */
464     pand_m2r(*M(3), r7);                           /* r7 = 72 __ __ __ */
465     pand_r2r(r1, r3);                              /* r3 = __ 62 __ __ */
466     por_r2r(r0, r7);                               /* r7 = 72 70 __ __ */
467     movq_r2r(r1, r0);                              /* r0 = 63 62 61 60 */
468     pand_m2r(*M(3), r1);                           /* r1 = 63 __ __ __ */
469     por_r2r(r3, r6);                               /* r6 = 71 62 __ __ */
470     movq_r2r(r4, r3);                              /* r3 = __ 57 56 55 */
471     psrlq_i2r(32, r1);                             /* r1 = __ __ 63 __ */
472     pand_r2r(r2, r3);                              /* r3 = __ __ __ 55 */
473     por_r2r(r1, r7);                               /* r7 = 72 70 63 __ */
474     por_r2r(r3, r7);                               /* r7 = 72 70 63 55 = R13 */
475     movq_r2r(r4, r3);                              /* r3 = __ 57 56 55 */
476     pand_m2r(*M(1), r3);                           /* r3 = __ __ 56 __ */
477     movq_r2r(r5, r1);                              /* r1 = 47 46 45 44 */
478     movq_r2m(r7, *(output_data_bytes+88));         /* write R13 = r7 */
479     psrlq_i2r(48, r5);                             /* r5 = __ __ __ 47 */
480     movq_m2r(*(input_bytes+64), r7);
481     por_r2r(r3, r6);                               /* r6 = 71 62 56 __ */
482     pmullw_m2r(*(dequant_matrix_bytes+64), r7);    /* r7 = 43 42 41 40 */
483     por_r2r(r5, r6);                               /* r6 = 71 62 56 47 = R12 */
484     pand_m2r(*M(2), r4);                           /* r4 = __ 57 __ __ */
485     psllq_i2r(32, r0);                             /* r0 = 61 60 __ __ */
486     movq_r2m(r6, *(output_data_bytes+72));         /* write R12 = r6 */
487     movq_r2r(r0, r6);                              /* r6 = 61 60 __ __ */
488     pand_m2r(*M(3), r0);                           /* r0 = 61 __ __ __ */
489     psllq_i2r(16, r6);                             /* r6 = 60 __ __ __ */
490     movq_m2r(*(input_bytes+40), r5);
491     movq_r2r(r1, r3);                              /* r3 = 47 46 45 44 */
492     pmullw_m2r(*(dequant_matrix_bytes+40), r5);    /* r5 = 27 26 25 24 */
493     psrlq_i2r(16, r1);                             /* r1 = __ 47 46 45 */
494     pand_m2r(*M(1), r1);                           /* r1 = __ __ 46 __ */
495     por_r2r(r4, r0);                               /* r0 = 61 57 __ __ */
496     pand_r2r(r7, r2);                              /* r2 = __ __ __ 40 */
497     por_r2r(r1, r0);                               /* r0 = 61 57 46 __ */
498     por_r2r(r2, r0);                               /* r0 = 61 57 46 40 = R11 */
499     psllq_i2r(16, r3);                             /* r3 = 46 45 44 __ */
500     movq_r2r(r3, r4);                              /* r4 = 46 45 44 __ */
501     movq_r2r(r5, r2);                              /* r2 = 27 26 25 24 */
502     movq_r2m(r0, *(output_data_bytes+112));        /* write R11 = r0 */
503     psrlq_i2r(48, r2);                             /* r2 = __ __ __ 27 */
504     pand_m2r(*M(2), r4);                           /* r4 = __ 45 __ __ */
505     por_r2r(r2, r6);                               /* r6 = 60 __ __ 27 */
506     movq_m2r(*M(1), r2);                           /* r2 = __ __ FF __ */
507     por_r2r(r4, r6);                               /* r6 = 60 45 __ 27 */
508     pand_r2r(r7, r2);                              /* r2 = __ __ 41 __ */
509     psllq_i2r(32, r3);                             /* r3 = 44 __ __ __ */
510     por_m2r(*(output_data_bytes+80), r3);          /* r3 = 44 __ __ 23 */
511     por_r2r(r2, r6);                               /* r6 = 60 45 41 27 = R10 */
512     movq_m2r(*M(3), r2);                           /* r2 = FF __ __ __ */
513     psllq_i2r(16, r5);                             /* r5 = 26 25 24 __ */
514     movq_r2m(r6, *(output_data_bytes+96));         /* store R10 = r6 */
515     pand_r2r(r5, r2);                              /* r2 = 26 __ __ __ */
516     movq_m2r(*M(2), r6);                           /* r6 = __ FF __ __ */
517     pxor_r2r(r2, r5);                              /* r5 = __ 25 24 __ */
518     pand_r2r(r7, r6);                              /* r6 = __ 42 __ __ */
519     psrlq_i2r(32, r2);                             /* r2 = __ __ 26 __ */
520     pand_m2r(*M(3), r7);                           /* r7 = 43 __ __ __ */
521     por_r2r(r2, r3);                               /* r3 = 44 __ 26 23 */
522     por_m2r(*(output_data_bytes+64), r7);          /* r7 = 43 __ __ 12 */
523     por_r2r(r3, r6);                               /* r6 = 44 42 26 23 = R9 */
524     por_r2r(r5, r7);                               /* r7 = 43 25 24 12 = R8 */
525     movq_r2m(r6, *(output_data_bytes+80));         /* store R9 = r6 */
526     movq_r2m(r7, *(output_data_bytes+64));         /* store R8 = r7 */
527
528
529 #undef M
530
531     /* at this point, function has completed dequantization + dezigzag +
532      * partial transposition; now do the idct itself */
533
534 #define I(K) (output_data + K * 8)
535 #define J(K) (output_data + ((K - 4) * 8) + 4)
536
537     RowIDCT();
538     Transpose();
539
540 #undef I
541 #undef J
542 #define I(K) (output_data + (K * 8) + 32)
543 #define J(K) (output_data + ((K - 4) * 8) + 36)
544
545     RowIDCT();
546     Transpose();
547
548 #undef I
549 #undef J
550 #define I(K) (output_data + K * 8)
551 #define J(K) (output_data + K * 8)
552
553     ColumnIDCT();
554
555 #undef I
556 #undef J
557 #define I(K) (output_data + (K * 8) + 4)
558 #define J(K) (output_data + (K * 8) + 4)
559
560     ColumnIDCT();
561
562 #undef I
563 #undef J
564
565 }
566
567 void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix,
568     int coeff_count, uint8_t *dest, int stride)
569 {
570     int16_t transformed_data[64];
571     int16_t *op;
572     int i, j;
573     uint8_t vector128[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
574
575     vp3_idct_mmx(input_data, dequant_matrix, transformed_data);
576
577     /* place in final output */
578     op = transformed_data;
579     movq_m2r(*vector128, mm0);
580     for (i = 0; i < 8; i++) {
581 #if 1
582         for (j = 0; j < 8; j++) {
583             if (*op < -128)
584                 *dest = 0;
585             else if (*op > 127)
586                 *dest = 255;
587             else
588                 *dest = (uint8_t)(*op + 128);
589             op++;
590             dest++;
591         }
592         dest += (stride - 8);
593 #else
594 /* prototype optimization */
595         pxor_r2r(mm1, mm1);
596         packsswb_m2r(*(op + 4), mm1);
597         movq_r2r(mm1, mm2);
598         psrlq_i2r(32, mm2);
599         packsswb_m2r(*(op + 0), mm1);
600         op += 8;
601         por_r2r(mm2, mm1);
602         paddb_r2r(mm0, mm1);
603         movq_r2m(mm1, *dest);
604         dest += stride;
605 #endif
606     }
607
608     /* be a good MMX citizen */
609     emms();
610 }
611
612 void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix,
613     int coeff_count, uint8_t *dest, int stride)
614 {
615     int16_t transformed_data[64];
616     int16_t *op;
617     int i, j;
618     int16_t sample;
619
620     vp3_idct_mmx(input_data, dequant_matrix, transformed_data);
621
622     /* place in final output */
623     op = transformed_data;
624     for (i = 0; i < 8; i++) {
625         for (j = 0; j < 8; j++) {
626             sample = *dest + *op;
627             if (sample < 0)
628                 *dest = 0;
629             else if (sample > 255)
630                 *dest = 255;
631             else
632                 *dest = (uint8_t)(sample & 0xFF);
633             op++;
634             dest++;
635         }
636         dest += (stride - 8);
637     }
638
639     /* be a good MMX citizen */
640     emms();
641 }