1 ; XVID MPEG-4 VIDEO CODEC
3 ; Conversion from gcc syntax to x264asm syntax with modifications
4 ; by Christophe Gisquet <christophe.gisquet@gmail.com>
6 ; =========== SSE2 inverse discrete cosine transform ===========
8 ; Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
10 ; Conversion to gcc syntax with modifications
11 ; by Alexander Strange <astrange@ithinksw.com>
13 ; Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
15 ; Vertical pass is an implementation of the scheme:
16 ; Loeffler C., Ligtenberg A., and Moschytz C.S.:
17 ; Practical Fast 1D DCT Algorithm with Eleven Multiplications,
18 ; Proc. ICASSP 1989, 988-991.
20 ; Horizontal pass is a double 4x4 vector/matrix multiplication,
21 ; (see also Intel's Application Note 922:
22 ; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
23 ; Copyright (C) 1999 Intel Corporation)
25 ; More details at http://skal.planet-d.net/coding/dct.html
27 ; ======= MMX and XMM forward discrete cosine transform =======
29 ; Copyright(C) 2001 Peter Ross <pross@xvid.org>
31 ; Originally provided by Intel at AP-922
32 ; http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
33 ; (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
34 ; but in a limited edition.
35 ; New macro implements a column part for precise iDCT
36 ; The routine precision now satisfies IEEE standard 1180-1990.
38 ; Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
39 ; Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
41 ; http://www.elecard.com/peter/idct.html
42 ; http://www.linuxvideo.org/mpeg2dec/
44 ; These examples contain code fragments for first stage iDCT 8x8
45 ; (for rows) and first stage DCT 8x8 (for columns)
47 ; conversion to gcc syntax by Michael Niedermayer
49 ; ======================================================================
51 ; This file is part of FFmpeg.
53 ; FFmpeg is free software; you can redistribute it and/or
54 ; modify it under the terms of the GNU Lesser General Public
55 ; License as published by the Free Software Foundation; either
56 ; version 2.1 of the License, or (at your option) any later version.
58 ; FFmpeg is distributed in the hope that it will be useful,
59 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
60 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
61 ; Lesser General Public License for more details.
63 ; You should have received a copy of the GNU Lesser General Public License
64 ; along with FFmpeg; if not, write to the Free Software Foundation,
65 ; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
67 %include "libavutil/x86/x86util.asm"
; Multiplier constants, each replicated into eight 16-bit words (16 bytes
; per label).
; NOTE(review): numerically the values match round(tan(k*pi/16) * 65536) for
; k = 1..3 and cos(pi/4) * 32768 for sqrt2 - confirm against the column pass.
70 ; Similar to tg_1_16 in MMX code
71 tan1: times 8 dw 13036 ; ~ 0.1989 * 65536
72 tan2: times 8 dw 27146 ; ~ 0.4142 * 65536
73 tan3: times 8 dw 43790 ; ~ 0.6682 * 65536; above 32767, so read as a signed word it is 43790 - 65536 = -21746
74 sqrt2: times 8 dw 23170 ; ~ 0.7071 * 32768
; Row-pass coefficient tables for the SSE2 code: four tables of 32 words
; (64 bytes) each. iMTX_MULT feeds 16-byte rows of the selected table
; (at table+16, +32 and +48 among others) to pmaddwd, so both the word
; order and the row order here are significant.
; Per the iMTX_MULT invocations in IDCT_SSE2: iTab1 serves rows 0 and 4,
; iTab2 rows 1 and 7, iTab3 rows 2 and 6, iTab4 rows 3 and 5.
77 iTab1: dw 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d
78 dw 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61
79 dw 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7
80 dw 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
81 iTab2: dw 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5
82 dw 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04
83 dw 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41
84 dw 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
85 iTab3: dw 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf
86 dw 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf
87 dw 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d
88 dw 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
89 iTab4: dw 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746
90 dw 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac
91 dw 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df
92 dw 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
95 ; -----------------------------------------------------------------------------
97 ; The first stage iDCT 8x8 - inverse DCTs of rows
99 ; -----------------------------------------------------------------------------
100 ; The 8-point inverse DCT direct algorithm
101 ; -----------------------------------------------------------------------------
103 ; static const short w[32] = {
104 ; FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
105 ; FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
106 ; FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
107 ; FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
108 ; FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
109 ; FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
110 ; FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
111 ; FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
113 ; #define DCT_8_INV_ROW(x, y)
115 ; int a0, a1, a2, a3, b0, b1, b2, b3;
117 ; a0 = x[0] * w[0] + x[2] * w[1] + x[4] * w[2] + x[6] * w[3];
118 ; a1 = x[0] * w[4] + x[2] * w[5] + x[4] * w[6] + x[6] * w[7];
119 ; a2 = x[0] * w[8] + x[2] * w[9] + x[4] * w[10] + x[6] * w[11];
120 ; a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
121 ; b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
122 ; b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
123 ; b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
124 ; b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
126 ; y[0] = SHIFT_ROUND(a0 + b0);
127 ; y[1] = SHIFT_ROUND(a1 + b1);
128 ; y[2] = SHIFT_ROUND(a2 + b2);
129 ; y[3] = SHIFT_ROUND(a3 + b3);
130 ; y[4] = SHIFT_ROUND(a3 - b3);
131 ; y[5] = SHIFT_ROUND(a2 - b2);
132 ; y[6] = SHIFT_ROUND(a1 - b1);
133 ; y[7] = SHIFT_ROUND(a0 - b0);
136 ; -----------------------------------------------------------------------------
138 ; In this implementation the outputs of the iDCT-1D are multiplied
139 ; for rows 0,4 - by cos_4_16,
140 ; for rows 1,7 - by cos_1_16,
141 ; for rows 2,6 - by cos_2_16,
142 ; for rows 3,5 - by cos_3_16
143 ; and are shifted to the left for better accuracy.
145 ; For the constants used,
146 ; FIX(float_const) = (short) (float_const * (1 << 15) + 0.5)
148 ; -----------------------------------------------------------------------------
150 ; -----------------------------------------------------------------------------
151 ; Tables for mmx processors
152 ; -----------------------------------------------------------------------------
; Four 64-byte coefficient sub-tables stored back to back under the single
; label tab_i_04_mmx. XVID_IDCT_MMX selects one with TAB + 64*k
; (k = 0: rows 0,4; 1: rows 1,7; 2: rows 2,6; 3: rows 3,5) when TAB is
; defined as tab_i_04_mmx. DCT_8_INV_ROW reads the selected sub-table
; 8 bytes at a time at fixed offsets, so the order of words and rows must
; not change. The trailing comments give the coefficient held in each
; 16-bit lane, highest lane first.
154 ; Table for rows 0,4 - constants are multiplied by cos_4_16
155 tab_i_04_mmx: dw 16384, 16384, 16384, -16384 ; movq-> w06 w04 w02 w00
156 dw 21407, 8867, 8867, -21407 ; w07 w05 w03 w01
157 dw 16384, -16384, 16384, 16384 ; w14 w12 w10 w08
158 dw -8867, 21407, -21407, -8867 ; w15 w13 w11 w09
159 dw 22725, 12873, 19266, -22725 ; w22 w20 w18 w16
160 dw 19266, 4520, -4520, -12873 ; w23 w21 w19 w17
161 dw 12873, 4520, 4520, 19266 ; w30 w28 w26 w24
162 dw -22725, 19266, -12873, -22725 ; w31 w29 w27 w25
163 ; Table for rows 1,7 - constants are multiplied by cos_1_16
164 dw 22725, 22725, 22725, -22725 ; movq-> w06 w04 w02 w00
165 dw 29692, 12299, 12299, -29692 ; w07 w05 w03 w01
166 dw 22725, -22725, 22725, 22725 ; w14 w12 w10 w08
167 dw -12299, 29692, -29692, -12299 ; w15 w13 w11 w09
168 dw 31521, 17855, 26722, -31521 ; w22 w20 w18 w16
169 dw 26722, 6270, -6270, -17855 ; w23 w21 w19 w17
170 dw 17855, 6270, 6270, 26722 ; w30 w28 w26 w24
171 dw -31521, 26722, -17855, -31521 ; w31 w29 w27 w25
172 ; Table for rows 2,6 - constants are multiplied by cos_2_16
173 dw 21407, 21407, 21407, -21407 ; movq-> w06 w04 w02 w00
174 dw 27969, 11585, 11585, -27969 ; w07 w05 w03 w01
175 dw 21407, -21407, 21407, 21407 ; w14 w12 w10 w08
176 dw -11585, 27969, -27969, -11585 ; w15 w13 w11 w09
177 dw 29692, 16819, 25172, -29692 ; w22 w20 w18 w16
178 dw 25172, 5906, -5906, -16819 ; w23 w21 w19 w17
179 dw 16819, 5906, 5906, 25172 ; w30 w28 w26 w24
180 dw -29692, 25172, -16819, -29692 ; w31 w29 w27 w25
181 ; Table for rows 3,5 - constants are multiplied by cos_3_16
182 dw 19266, 19266, 19266, -19266 ; movq-> w06 w04 w02 w00
183 dw 25172, 10426, 10426, -25172 ; w07 w05 w03 w01
184 dw 19266, -19266, 19266, 19266 ; w14 w12 w10 w08
185 dw -10426, 25172, -25172, -10426 ; w15 w13 w11 w09
186 dw 26722, 15137, 22654, -26722 ; w22 w20 w18 w16
187 dw 22654, 5315, -5315, -15137 ; w23 w21 w19 w17
188 dw 15137, 5315, 5315, 22654 ; w30 w28 w26 w24
189 dw -26722, 22654, -15137, -26722 ; w31 w29 w27 w25
191 ; -----------------------------------------------------------------------------
192 ; Tables for xmm processors
193 ; -----------------------------------------------------------------------------
; Four 64-byte coefficient sub-tables stored back to back under the single
; label tab_i_04_xmm. XVID_IDCT_MMX selects one with TAB + 64*k
; (k = 0: rows 0,4; 1: rows 1,7; 2: rows 2,6; 3: rows 3,5) when TAB is
; defined as tab_i_04_xmm. The lane order differs from tab_i_04_mmx (see the
; per-line lane comments), so the two tables are not interchangeable.
195 ; Table for rows 0,4 - constants are multiplied by cos_4_16
196 tab_i_04_xmm: dw 16384, 21407, 16384, 8867 ; movq-> w05 w04 w01 w00
197 dw 16384, 8867, -16384, -21407 ; w07 w06 w03 w02
198 dw 16384, -8867, 16384, -21407 ; w13 w12 w09 w08
199 dw -16384, 21407, 16384, -8867 ; w15 w14 w11 w10
200 dw 22725, 19266, 19266, -4520 ; w21 w20 w17 w16
201 dw 12873, 4520, -22725, -12873 ; w23 w22 w19 w18
202 dw 12873, -22725, 4520, -12873 ; w29 w28 w25 w24
203 dw 4520, 19266, 19266, -22725 ; w31 w30 w27 w26
204 ; Table for rows 1,7 - constants are multiplied by cos_1_16
205 dw 22725, 29692, 22725, 12299 ; movq-> w05 w04 w01 w00
206 dw 22725, 12299, -22725, -29692 ; w07 w06 w03 w02
207 dw 22725, -12299, 22725, -29692 ; w13 w12 w09 w08
208 dw -22725, 29692, 22725, -12299 ; w15 w14 w11 w10
209 dw 31521, 26722, 26722, -6270 ; w21 w20 w17 w16
210 dw 17855, 6270, -31521, -17855 ; w23 w22 w19 w18
211 dw 17855, -31521, 6270, -17855 ; w29 w28 w25 w24
212 dw 6270, 26722, 26722, -31521 ; w31 w30 w27 w26
213 ; Table for rows 2,6 - constants are multiplied by cos_2_16
214 dw 21407, 27969, 21407, 11585 ; movq-> w05 w04 w01 w00
215 dw 21407, 11585, -21407, -27969 ; w07 w06 w03 w02
216 dw 21407, -11585, 21407, -27969 ; w13 w12 w09 w08
217 dw -21407, 27969, 21407, -11585 ; w15 w14 w11 w10
218 dw 29692, 25172, 25172, -5906 ; w21 w20 w17 w16
219 dw 16819, 5906, -29692, -16819 ; w23 w22 w19 w18
220 dw 16819, -29692, 5906, -16819 ; w29 w28 w25 w24
221 dw 5906, 25172, 25172, -29692 ; w31 w30 w27 w26
222 ; Table for rows 3,5 - constants are multiplied by cos_3_16
223 dw 19266, 25172, 19266, 10426 ; movq-> w05 w04 w01 w00
224 dw 19266, 10426, -19266, -25172 ; w07 w06 w03 w02
225 dw 19266, -10426, 19266, -25172 ; w13 w12 w09 w08
226 dw -19266, 25172, 19266, -10426 ; w15 w14 w11 w10
227 dw 26722, 22654, 22654, -5315 ; w21 w20 w17 w16
228 dw 15137, 5315, -26722, -15137 ; w23 w22 w19 w18
229 dw 15137, -26722, 5315, -15137 ; w29 w28 w25 w24
230 dw 5315, 22654, 22654, -26722 ; w31 w30 w27 w26
231 %endif ; ~ARCH_X86_32
233 ; Similar to rounder_0 in MMX code
234 ; 4 first similar, then: 4*8->6*16 5*8->4*16 6/7*8->5*16
235 walkenIdctRounders: times 4 dd 65536
243 pb_127: times 8 db 127
247 ; Temporary storage before the column pass
257 pshufhw %1, xmm2, 0x1B
276 pshufhw xmm2, xmm2, 0x1B
295 %define ROW0 [BLOCK + 0*16]
297 %define ROW2 [BLOCK + 2*16]
299 %define ROW4 [BLOCK + 4*16]
301 %define ROW6 [BLOCK + 6*16]
319 %macro TEST_ONE_ROW 4 ; src, reg, clear, arg
327 ;row1, row2, reg1, reg2, clear1, arg1, clear2, arg2
328 %macro TEST_TWO_ROWS 8
; iMTX_MULT src, table, put, arg [, rounder]
; Multiplies rearranged copies of one 8-word row against the 16-byte rows of
; the coefficient table %2 using pmaddwd (xmm0-xmm3 are used as work registers).
;   %2 - coefficient table (iTab1..iTab4 at the call sites)
;   %5 - optional; byte offset into walkenIdctRounders of the vector added
;        to xmm0
; NOTE(review): several instructions of this macro (the load of %1, the use
; of %3/%4 and the final shift/pack) are not visible in this excerpt.
342 %macro iMTX_MULT 4-5 ; src, table, put, arg, rounder
345 pshufd xmm1, xmm3, 0x11 ; 4602
346 punpcklqdq xmm0, xmm0 ; 0246
348 pmaddwd xmm1, [%2+16] ; multiply-accumulate word pairs against table bytes 16..31
349 pshufd xmm2, xmm3, 0xBB ; 5713
350 punpckhqdq xmm3, xmm3 ; 1357
351 pmaddwd xmm2, [%2+32] ; table bytes 32..47
352 pmaddwd xmm3, [%2+48] ; table bytes 48..63
356 paddd xmm0, [walkenIdctRounders+%5] ; add the per-row rounding vector
; FIRST_HALF dct, type
; Handles the four rows held in TAN3, xmm3, REG0 and xmm5 (rows 1, 2, 5 and 6
; judging by the store offsets below).
;   %1 - address of the coefficient block
;   %2 - output type: 0=normal, 1=put, 2=add (numbering as in IDCT_SSE2)
; NOTE(review): the conditional-assembly directives and several instructions
; between the visible lines are missing from this excerpt.
372 %macro FIRST_HALF 2 ; %1=dct %2=type(0=normal,1=put,2=add)
377 ; dct coeffs must still be written for AC prediction
379 movdqa [%1+1*16], TAN3
380 movdqa [%1+2*16], xmm3
381 movdqa [%1+5*16], REG0
382 movdqa [%1+6*16], xmm5
384 ; Must now load args as gprs are no longer used for masks
385 ; DEST is set to where address of dest was loaded
387 %if %2 == 2 ; Not enough xmms, store
; The rows are parked in the block; SECOND_HALF's add path reads them back
; from [%1 + 1*16], [%1 + 2*16], [%1 + 5*16] and [%1 + 6*16].
388 movdqa [%1+1*16], TAN3
389 movdqa [%1+2*16], xmm3
390 movdqa [%1+5*16], REG0
391 movdqa [%1+6*16], xmm5
393 %xdefine DEST r2q ; BLOCK is r0, stride r1
394 movifnidn DEST, destm
395 movifnidn strideq, stridem
403 movq [DEST + strideq], TAN3 ; low 8 bytes -> second output line
404 movhps [DEST + 2*strideq], TAN3 ; high 8 bytes -> third output line
405 ; REG0 and TAN3 are now available (and likely used in second half)
; SECOND_HALF dct, type, xmmA, xmmB, xmmC, xmmD
; Writes out the remaining rows (passed in %3-%6) and, for the put/add
; types, finishes the rows left over from FIRST_HALF.
;   %1 - address of the coefficient block
;   %2 - output type: 0=normal, 1=put, 2=add (numbering as in IDCT_SSE2)
;   %3-%6 - xmm registers holding the rows produced by the column pass
; Each 8-byte half of a packed register is one output line (movq = low half,
; movhps = high half).
; NOTE(review): r3q is presumably 3*stride and xmm0 presumably zero for the
; punpcklbw widening; their setup, the %if/%elif directives and several
; arithmetic instructions are not visible in this excerpt.
410 %macro SECOND_HALF 6 ; %1=dct %2=type(0=normal,1=put,2=add) 3-6: xmms
415 ; dct coeffs must still be written for AC prediction
424 ; address of dest may have been loaded
426 movhps [DEST + r3q], %3
427 lea DEST, [DEST + 4*strideq] ; advance to the lower four lines
429 movhps [DEST + r3q], %6
430 ; and now write remainder of first half
431 movq [DEST + 2*strideq], xmm5
432 movhps [DEST + strideq], xmm5
; Add variant limited to xmm0-xmm7: rows 1, 2, 5 and 6 are taken from the
; block in memory, where FIRST_HALF's type-2 branch stored them.
436 ; free: m3 REG0=m4 m5
437 ; input: m1, m7, m2, m6
438 movq xmm3, [DEST+0*strideq]
439 movq xmm4, [DEST+1*strideq]
443 paddsw xmm4, [%1 + 1*16]
444 movq %3, [DEST+2*strideq]
445 movq xmm5, [DEST+ r3q]
448 paddsw %3, [%1 + 2*16]
452 movq [DEST+0*strideq], xmm3
453 movhps [DEST+1*strideq], xmm3
454 movq [DEST+2*strideq], %3
455 movhps [DEST+ r3q], %3
456 lea DEST, [DEST+4*strideq] ; advance to the lower four lines
457 movq xmm3, [DEST+0*strideq]
458 movq xmm4, [DEST+1*strideq]
459 movq %3, [DEST+2*strideq]
460 movq xmm5, [DEST+ r3q]
466 paddsw xmm4, [%1 + 5*16]
467 paddsw %3, [%1 + 6*16]
471 movq [DEST+0*strideq], xmm3
472 movhps [DEST+1*strideq], xmm3
473 movq [DEST+2*strideq], %3
474 movhps [DEST+ r3q], %3
; Add variant using xmm11/xmm12, which exist only in 64-bit mode; per the
; register map below the first-half rows stay in registers here.
476 ; l1:TAN3=m13 l2:m3 l5:REG0=m8 l6=m5
477 ; input: m1, m7/SREG2=m9, TAN1=m14, REG4=m10
478 movq xmm2, [DEST+0*strideq]
479 movq xmm4, [DEST+1*strideq]
480 movq xmm12, [DEST+2*strideq]
481 movq xmm11, [DEST+ r3q]
484 punpcklbw xmm12, xmm0
485 punpcklbw xmm11, xmm0
491 packuswb xmm12, xmm11 ; back to bytes with unsigned saturation
492 movq [DEST+0*strideq], xmm2
493 movhps [DEST+1*strideq], xmm2
494 movq [DEST+2*strideq], xmm12
495 movhps [DEST+ r3q], xmm12
496 lea DEST, [DEST+4*strideq] ; advance to the lower four lines
497 movq xmm2, [DEST+0*strideq]
498 movq xmm4, [DEST+1*strideq]
499 movq xmm12, [DEST+2*strideq]
500 movq xmm11, [DEST+ r3q]
503 punpcklbw xmm12, xmm0
504 punpcklbw xmm11, xmm0
510 packuswb xmm12, xmm11 ; back to bytes with unsigned saturation
511 movq [DEST+0*strideq], xmm2
512 movhps [DEST+1*strideq], xmm2
513 movq [DEST+2*strideq], xmm12
514 movhps [DEST+ r3q], xmm12
520 ; IDCT pass on columns.
;   %1 - address of the coefficient block
;   %2 - output type: 0=normal, 1=put, 2=add (numbering as in IDCT_SSE2)
; NOTE(review): the body between the header and the final SECOND_HALF
; invocation is not visible in this excerpt.
521 %macro iLLM_PASS 2 ; %1=dct %2=type(0=normal,1=put,2=add)
585 SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4 ; hand the last four rows to the output stage
588 ; IDCT pass on columns, assuming rows 4-7 are zero
;   %1 - address of the coefficient block
;   %2 - output type: 0=normal, 1=put, 2=add (numbering as in IDCT_SSE2)
; NOTE(review): the body between the header and the final SECOND_HALF
; invocation is not visible in this excerpt.
589 %macro iLLM_PASS_SPARSE 2 ; %1=dct %2=type(0=normal,1=put,2=add)
636 SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6 ; same output stage as iLLM_PASS, different registers
; IDCT_SSE2 type
; Emits one of xvid_idct / xvid_idct_put / xvid_idct_add.
;   %1 - 0=normal, 1=put, 2=add
; Row pass: rows 0-2 are always transformed with iMTX_MULT; the remaining
; rows are examined with TEST_TWO_ROWS / TEST_ONE_ROW first. Table use is
; symmetric: iTab1 for rows 0/4, iTab2 for 1/7, iTab3 for 2/6, iTab4 for 3/5.
; NOTE(review): the branch instructions and labels that skip rows, and the
; %if/%else lines choosing between the cglobal variants, are not visible in
; this excerpt.
639 %macro IDCT_SSE2 1 ; 0=normal 1=put 2=add
640 %if %1 == 0 || ARCH_X86_32
654 cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
655 %xdefine BLOCK blockq
658 cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
660 cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
663 %xdefine BLOCK blockq
; The last argument is a byte offset into walkenIdctRounders.
670 iMTX_MULT BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
671 iMTX_MULT BLOCK + 1*16, iTab2, PUT_ODD, ROW1, 1*16
672 iMTX_MULT BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
674 TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
676 iMTX_MULT BLOCK + 3*16, iTab4, PUT_ODD, ROW3, 3*16
678 TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
679 TEST_ONE_ROW BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
686 iLLM_PASS_SPARSE BLOCK, %1 ; column pass variant that assumes rows 4-7 are zero
; Row 4 is invoked with four arguments, i.e. without a rounder; rows 5, 6
; and 7 use rounder offsets 4*16, 5*16 and 5*16 (rows 6 and 7 share one).
689 iMTX_MULT BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
691 iMTX_MULT BLOCK + 5*16, iTab4, PUT_ODD, ROW5, 4*16
694 iMTX_MULT BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
698 iMTX_MULT BLOCK + 7*16, iTab2, PUT_ODD, ROW7, 5*16
; DCT_8_INV_ROW row, table, rounder
; In-place 8-point inverse DCT of one row of 16-bit coefficients using
; mm0-mm7. Sums are formed with pmaddwd, the rounding constant is added to
; the even sums, results are shifted right arithmetically by 11 and packed
; back to words with signed saturation.
715 ; %1=row index (the row is read from and written back to [r0+16*%1]) %2=address of the row's 64-byte coefficient table
716 ; %3=byte offset into walkenIdctRounders (rnd_offset) where 4*8->6*16 5*8->4*16 6/7*8->5*16
; NOTE(review): two instruction sequences follow, one built on pshufw and one
; on punpck*; the conditional-assembly directives selecting between them are
; not visible in this excerpt. The lane comments of the pshufw sequence
; follow the tab_i_04_xmm layout, those of the punpck* sequence the
; tab_i_04_mmx layout.
717 %macro DCT_8_INV_ROW 3
718 movq mm0, [r0+16*%1+0] ; 0 ; x3 x2 x1 x0
719 movq mm1, [r0+16*%1+8] ; 1 ; x7 x6 x5 x4
720 movq mm2, mm0 ; 2 ; x3 x2 x1 x0
721 movq mm3, [%2+ 0] ; 3 ; w06 w04 w02 w00
; --- pshufw-based sequence ---
723 pshufw mm0, mm0, 0x88 ; x2 x0 x2 x0
724 movq mm4, [%2+ 8] ; 4 ; w07 w06 w03 w02
725 movq mm5, mm1 ; 5 ; x7 x6 x5 x4
726 pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00
727 movq mm6, [%2+32] ; 6 ; w21 w20 w17 w16
728 pshufw mm1, mm1, 0x88 ; x6 x4 x6 x4
729 pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02
730 movq mm7, [%2+40] ; 7; w23 w22 w19 w18
731 pshufw mm2, mm2, 0xdd ; x3 x1 x3 x1
732 pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16
733 pshufw mm5, mm5, 0xdd ; x7 x5 x7 x5
734 pmaddwd mm7, mm5 ; x7*w23+x5*w22 x7*w19+x5*w18
735 paddd mm3, [walkenIdctRounders + %3] ; +%3
736 pmaddwd mm0, [%2+16] ; x2*w13+x0*w12 x2*w09+x0*w08
737 paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0)
738 pmaddwd mm1, [%2+24] ; x6*w15+x4*w14 x6*w11+x4*w10
739 movq mm4, mm3 ; 4 ; a1 a0
740 pmaddwd mm2, [%2+48] ; x3*w29+x1*w28 x3*w25+x1*w24
741 paddd mm6, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0)
742 pmaddwd mm5, [%2+56] ; x7*w31+x5*w30 x7*w27+x5*w26
743 paddd mm3, mm6 ; a1+b1 a0+b0
744 paddd mm0, [walkenIdctRounders + %3] ; +%3
745 psrad mm3, 11 ; y1=a1+b1 y0=a0+b0
746 paddd mm0, mm1 ; 1 ; a3=sum(even3) a2=sum(even2)
747 psubd mm4, mm6 ; 6 ; a1-b1 a0-b0
748 movq mm7, mm0 ; 7 ; a3 a2
749 paddd mm2, mm5 ; 5 ; b3=sum(odd3) b2=sum(odd2)
750 paddd mm0, mm2 ; a3+b3 a2+b2
751 psrad mm4, 11 ; y6=a1-b1 y7=a0-b0
752 psubd mm7, mm2 ; 2 ; a3-b3 a2-b2
753 psrad mm0, 11 ; y3=a3+b3 y2=a2+b2
754 psrad mm7, 11 ; y4=a3-b3 y5=a2-b2
755 packssdw mm3, mm0 ; 0 ; y3 y2 y1 y0
756 packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5
757 movq [r0+16*%1+0], mm3 ; 3 ; save y3 y2 y1 y0
758 pshufw mm7, mm7, 0xb1 ; y7 y6 y5 y4
; --- punpck*-based sequence ---
760 punpcklwd mm0, mm1 ; x5 x1 x4 x0
761 movq mm5, mm0 ; 5 ; x5 x1 x4 x0
762 punpckldq mm0, mm0 ; x4 x0 x4 x0
763 movq mm4, [%2+ 8] ; 4 ; w07 w05 w03 w01
764 punpckhwd mm2, mm1 ; 1 ; x7 x3 x6 x2
765 pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00
766 movq mm6, mm2 ; 6 ; x7 x3 x6 x2
767 movq mm1, [%2+32] ; 1 ; w22 w20 w18 w16
768 punpckldq mm2, mm2 ; x6 x2 x6 x2
769 pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01
770 punpckhdq mm5, mm5 ; x5 x1 x5 x1
771 pmaddwd mm0, [%2+16] ; x4*w14+x0*w12 x4*w10+x0*w08
772 punpckhdq mm6, mm6 ; x7 x3 x7 x3
773 movq mm7, [%2+40] ; 7 ; w23 w21 w19 w17
774 pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16
775 paddd mm3, [walkenIdctRounders + %3] ; +%3
776 pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17
777 pmaddwd mm2, [%2+24] ; x6*w15+x2*w13 x6*w11+x2*w09
778 paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0)
779 pmaddwd mm5, [%2+48] ; x5*w30+x1*w28 x5*w26+x1*w24
780 movq mm4, mm3 ; 4 ; a1 a0
781 pmaddwd mm6, [%2+56] ; x7*w31+x3*w29 x7*w27+x3*w25
782 paddd mm1, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0)
783 paddd mm0, [walkenIdctRounders + %3] ; +%3
784 psubd mm3, mm1 ; a1-b1 a0-b0
785 psrad mm3, 11 ; y6=a1-b1 y7=a0-b0
786 paddd mm1, mm4 ; 4 ; a1+b1 a0+b0
787 paddd mm0, mm2 ; 2 ; a3=sum(even3) a2=sum(even2)
788 psrad mm1, 11 ; y1=a1+b1 y0=a0+b0
789 paddd mm5, mm6 ; 6 ; b3=sum(odd3) b2=sum(odd2)
790 movq mm4, mm0 ; 4 ; a3 a2
791 paddd mm0, mm5 ; a3+b3 a2+b2
792 psubd mm4, mm5 ; 5 ; a3-b3 a2-b2
793 psrad mm0, 11 ; y3=a3+b3 y2=a2+b2
794 psrad mm4, 11 ; y4=a3-b3 y5=a2-b2
795 packssdw mm1, mm0 ; 0 ; y3 y2 y1 y0
796 packssdw mm4, mm3 ; 3 ; y6 y7 y4 y5
; Swap the two words of each dword with shifts and an OR.
797 movq mm7, mm4 ; 7 ; y6 y7 y4 y5
798 psrld mm4, 16 ; 0 y6 0 y4
799 pslld mm7, 16 ; y7 0 y5 0
800 movq [r0+16*%1+0], mm1 ; 1 ; save y3 y2 y1 y0
801 por mm7, mm4 ; 4 ; y7 y6 y5 y4
803 movq [r0+16*%1+8], mm7 ; 7 ; save y7 y6 y5 y4
806 ; -----------------------------------------------------------------------------
808 ; The first stage DCT 8x8 - forward DCTs of columns
810 ; The outputs are multiplied
811 ; for rows 0,4 - by cos_4_16,
812 ; for rows 1,7 - by cos_1_16,
813 ; for rows 2,6 - by cos_2_16,
814 ; for rows 3,5 - by cos_3_16
815 ; and are shifted to the left for better accuracy.
817 ; -----------------------------------------------------------------------------
819 ; The 8-point scaled forward DCT algorithm (26a8m)
821 ; -----------------------------------------------------------------------------
823 ;#define DCT_8_FRW_COL(x, y)
825 ; short t0, t1, t2, t3, t4, t5, t6, t7;
826 ; short tp03, tm03, tp12, tm12, tp65, tm65;
827 ; short tp465, tm465, tp765, tm765;
829 ; t0 = LEFT_SHIFT(x[0] + x[7]);
830 ; t1 = LEFT_SHIFT(x[1] + x[6]);
831 ; t2 = LEFT_SHIFT(x[2] + x[5]);
832 ; t3 = LEFT_SHIFT(x[3] + x[4]);
833 ; t4 = LEFT_SHIFT(x[3] - x[4]);
834 ; t5 = LEFT_SHIFT(x[2] - x[5]);
835 ; t6 = LEFT_SHIFT(x[1] - x[6]);
836 ; t7 = LEFT_SHIFT(x[0] - x[7]);
843 ; y[0] = tp03 + tp12;
844 ; y[4] = tp03 - tp12;
846 ; y[2] = tm03 + tm12 * tg_2_16;
847 ; y[6] = tm03 * tg_2_16 - tm12;
849 ; tp65 = (t6 + t5) * cos_4_16;
850 ; tm65 = (t6 - t5) * cos_4_16;
857 ; y[1] = tp765 + tp465 * tg_1_16;
858 ; y[7] = tp765 * tg_1_16 - tp465;
859 ; y[5] = tm765 * tg_3_16 + tm465;
860 ; y[3] = tm765 - tm465 * tg_3_16;
863 ; -----------------------------------------------------------------------------
865 ; -----------------------------------------------------------------------------
866 ; DCT_8_INV_COL BASE - column pass; reads and writes rows at [BASE + n*16]
867 ; -----------------------------------------------------------------------------
; DCT_8_INV_COL base
; Column pass on 8 bytes (four 16-bit columns) per row using mm0-mm7, with
; pmulhw multiplies and saturating 16-bit adds/subtracts (paddsw/psubsw).
;   %1 - base address; rows are addressed as [%1 + n*16]
; [%1+3*16] and [%1+5*16] double as scratch: b0 and b3 are parked there and
; reloaded once a0..a3 are available.
; NOTE(review): the loads of the rows and of the tan/sqrt2 constants, the
; final shifts and the result stores are not visible in this excerpt.
868 %macro DCT_8_INV_COL 1
871 movq mm1, mm0 ; tg_3_16
873 pmulhw mm0, mm3 ; x3*(tg_3_16-1)
875 pmulhw mm1, mm5 ; x5*(tg_3_16-1)
877 movq mm2, mm4 ; tg_1_16
879 pmulhw mm4, mm7 ; x7*tg_1_16
880 paddsw mm0, mm3 ; x3*tg_3_16
881 pmulhw mm2, mm6 ; x1*tg_1_16
882 paddsw mm1, mm3 ; x3+x5*(tg_3_16-1)
883 psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35
885 paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35
886 paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17
887 psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17
890 paddsw mm5, mm1 ; tp17+tp35 = b0
891 psubsw mm6, mm0 ; tm17-tm35 = b3
892 psubsw mm4, mm1 ; tp17-tp35 = t1
893 paddsw mm2, mm0 ; tm17+tm35 = t2
896 movq [%1+3*16], mm5 ; save b0
897 paddsw mm1, mm2 ; t1+t2
898 movq [%1+5*16], mm6 ; save b3
899 psubsw mm4, mm2 ; t1-t2
901 movq mm0, mm7 ; tg_2_16
903 pmulhw mm0, mm5 ; x2*tg_2_16
904 pmulhw mm7, mm6 ; x6*tg_2_16
905 pmulhw mm1, mm3 ; ocos_4_16*(t1+t2) = b1/2
907 pmulhw mm4, mm3 ; ocos_4_16*(t1-t2) = b2/2
908 psubsw mm0, mm6 ; x2*tg_2_16-x6 = tm26
911 paddsw mm7, mm5 ; x2+x6*tg_2_16 = tp26
912 paddsw mm2, mm6 ; x0+x4 = tp04
913 psubsw mm3, mm6 ; x0-x4 = tm04
916 psubsw mm2, mm7 ; tp04-tp26 = a3
917 paddsw mm3, mm0 ; tm04+tm26 = a1
920 paddsw mm5, mm7 ; tp04+tp26 = a0
921 psubsw mm6, mm0 ; tm04-tm26 = a2
924 paddsw mm3, mm1 ; a1+b1
925 paddsw mm6, mm4 ; a2+b2
927 psubsw mm7, mm1 ; a1-b1
929 psubsw mm0, mm4 ; a2-b2
930 movq mm1, [%1+3*16] ; load b0
935 paddsw mm5, mm1 ; a0+b0
937 psubsw mm4, mm1 ; a0-b0
938 movq mm3, [%1+5*16] ; load b3
943 paddsw mm2, mm3 ; a3+b3
945 psubsw mm6, mm3 ; a3-b3
; XVID_IDCT_MMX
; Emits the MMX-register xvid_idct(block): a row pass over all eight rows
; followed by a column pass.
; NOTE(review): the %if/%else lines choosing between the two TAB definitions,
; the column-pass invocations and the end of the macro are not visible in
; this excerpt.
954 %macro XVID_IDCT_MMX 0
955 cglobal xvid_idct, 1, 1, 0, block
957 %define TAB tab_i_04_xmm
959 %define TAB tab_i_04_mmx
; Arguments: row index, 64-byte coefficient sub-table, byte offset into
; walkenIdctRounders. Sub-tables are shared symmetrically (rows 0/4, 1/7,
; 2/6, 3/5); the rounder offsets of rows 4-7 are not in row order, and
; rows 6 and 7 share offset 5*16.
961 ; Process each row - beware of rounder offset
962 DCT_8_INV_ROW 0, TAB + 64 * 0, 0*16
963 DCT_8_INV_ROW 1, TAB + 64 * 1, 1*16
964 DCT_8_INV_ROW 2, TAB + 64 * 2, 2*16
965 DCT_8_INV_ROW 3, TAB + 64 * 3, 3*16
966 DCT_8_INV_ROW 4, TAB + 64 * 0, 6*16
967 DCT_8_INV_ROW 5, TAB + 64 * 3, 4*16
968 DCT_8_INV_ROW 6, TAB + 64 * 2, 5*16
969 DCT_8_INV_ROW 7, TAB + 64 * 1, 5*16
971 ; Process the columns (4 at a time)
983 %endif ; ~ARCH_X86_32