1 ; Chinese AVS video (AVS1-P2, JiZhun profile) decoder
2 ; Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
4 ; MMX-optimized DSP functions, based on H.264 optimizations by
5 ; Michael Niedermayer and Loren Merritt
6 ; Conversion from gcc syntax to x264asm syntax with modifications
7 ; by Ronald S. Bultje <rsbultje@gmail.com>
9 ; This file is part of FFmpeg.
11 ; FFmpeg is free software; you can redistribute it and/or
12 ; modify it under the terms of the GNU Lesser General Public
13 ; License as published by the Free Software Foundation; either
14 ; version 2.1 of the License, or (at your option) any later version.
16 ; FFmpeg is distributed in the hope that it will be useful,
17 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 ; Lesser General Public License for more details.
21 ; You should have received a copy of the GNU Lesser General Public License
22 ; along with FFmpeg; if not, write to the Free Software Foundation,
23 ; Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 %include "libavutil/x86/x86util.asm"
;------------------------------------------------------------------------------
; CAVS_IDCT8_1D  source, round [, init_load=1]
; One 1-D pass of the 8-point AVS (AVS1-P2 JiZhun) inverse transform over
; int16 coefficients stored as eight 16-byte rows at [source + i*16].
;   %1 = base address of the coefficient rows
;   %2 = memory operand with the rounding bias added to the even part
;        (callers below pass pw_4 for the first pass, pw_64 for the second)
;   %3 = init_load: when 1 (default) the odd rows src1/3/5/7 are loaded here;
;        when 0 the caller is expected to have left them in registers already
; Output: dst0..dst7 end up in m7, m5, m3, m1, m0, m2, m4, m6 (see the final
; SUMSUB_BA comments below).
; NOTE(review): this excerpt elides several original lines — the number in
; column 1 of each line is the line number in the full file.  Among the
; elided lines are register copies, shift/doubling steps and the closing
; %endmacro; confirm any change against the complete source.
;------------------------------------------------------------------------------
32 %macro CAVS_IDCT8_1D 2-3 1 ; source, round, init_load
; --- odd part: build a0..a3 from src1, src3, src5, src7 ---
34 mova m4, [%1+7*16] ; m4 = src7
35 mova m5, [%1+1*16] ; m5 = src1
36 mova m2, [%1+5*16] ; m2 = src5
37 mova m7, [%1+3*16] ; m7 = src3
; NOTE(review): m0/m3/m6/m1 presumably hold copies of src7/src1/src5/src3
; here (the mova copies are not visible in this excerpt) — the "3*srcN"
; comments below only work out under that assumption.
47 paddw m4, m4 ; m4 = 2*src7
48 paddw m3, m3 ; m3 = 2*src1
49 paddw m6, m6 ; m6 = 2*src5
50 paddw m1, m1 ; m1 = 2*src3
51 paddw m0, m4 ; m0 = 3*src7
52 paddw m5, m3 ; m5 = 3*src1
53 paddw m2, m6 ; m2 = 3*src5
54 paddw m7, m1 ; m7 = 3*src3
55 psubw m5, m4 ; m5 = 3*src1 - 2*src7 = a0
56 paddw m7, m6 ; m7 = 3*src3 + 2*src5 = a1
57 psubw m1, m2 ; m1 = 2*src3 - 3*src5 = a2
58 paddw m3, m0 ; m3 = 2*src1 + 3*src7 = a3
; --- combine a0..a3 into the odd-part butterflies b4..b7 ---
; (copies of a0..a3 into m4/m6/m2/m0 are elided from this excerpt)
64 SUMSUB_BA w, 7, 5 ; m7 = a0 + a1, m5 = a0 - a1
65 paddw m7, m3 ; m7 = a0 + a1 + a3
66 paddw m5, m1 ; m5 = a0 - a1 + a2
; (doubling of m7/m5 before the next two adds is elided from this excerpt)
69 paddw m7, m6 ; m7 = b4
70 paddw m5, m4 ; m5 = b5
72 SUMSUB_BA w, 1, 3 ; m1 = a3 + a2, m3 = a3 - a2
73 psubw m4, m1 ; m4 = a0 - a2 - a3
74 mova m1, m4 ; m1 = a0 - a2 - a3
75 psubw m3, m6 ; m3 = a3 - a2 - a1
; (doubling of m1/m3 is elided from this excerpt)
78 psubw m1, m2 ; m1 = b7
79 paddw m3, m0 ; m3 = b6
; --- even part: a6/a7 from src2 and src6 ---
81 mova m2, [%1+2*16] ; m2 = src2
82 mova m6, [%1+6*16] ; m6 = src6
; NOTE(review): m4/m0 presumably hold copies of src2/src6 here (copies not
; visible in this excerpt)
85 psllw m4, 2 ; m4 = 4*src2
86 psllw m6, 2 ; m6 = 4*src6
87 paddw m2, m4 ; m2 = 5*src2
88 paddw m0, m6 ; m0 = 5*src6
; (doubling of m2/m0 — 5*src -> 10*src — is elided from this excerpt)
91 psubw m4, m0 ; m4 = 4*src2 - 10*src6 = a7
92 paddw m6, m2 ; m6 = 4*src6 + 10*src2 = a6
; --- even part: a4/a5 from src0 and src4, plus rounding bias ---
94 mova m2, [%1+0*16] ; m2 = src0
95 mova m0, [%1+4*16] ; m0 = src4
96 SUMSUB_BA w, 0, 2 ; m0 = src0 + src4, m2 = src0 - src4
; (scaling of m0/m2 before the bias is elided from this excerpt)
99 paddw m0, %2 ; add rounding bias
100 paddw m2, %2 ; add rounding bias
; --- final butterflies: even +/- odd gives dst0..dst7 ---
102 SUMSUB_BA w, 6, 0 ; m6 = a4 + a6, m0 = a4 - a6
103 SUMSUB_BA w, 4, 2 ; m4 = a5 + a7, m2 = a5 - a7
104 SUMSUB_BA w, 7, 6 ; m7 = dst0, m6 = dst7
105 SUMSUB_BA w, 5, 4 ; m5 = dst1, m4 = dst6
106 SUMSUB_BA w, 3, 2 ; m3 = dst2, m2 = dst5
107 SUMSUB_BA w, 1, 0 ; m1 = dst3, m0 = dst4
;------------------------------------------------------------------------------
; void cavs_idct8(int16_t *out, int16_t *in)
; Full 8x8 inverse transform: first CAVS_IDCT8_1D pass on the input (bias
; pw_4), a transpose, then a second pass (bias pw_64) on a temporary area
; addressed by tmpq.  Uses 4 GPRs, 8 vector registers and 8*16 bytes of
; stack (presumably the transpose scratch — tmpq likely points into it;
; confirm against the elided body).  The pair of 4x4 word transposes
; suggests this is the 64-bit-vector (pre-SSE2) build; the surrounding
; INIT_* directive is not visible in this excerpt.
; NOTE(review): most of the function body (loads/stores between passes,
; any loop over cntq, the epilogue) is elided; column-1 numbers are the
; line numbers in the full file.
;------------------------------------------------------------------------------
111 cglobal cavs_idct8, 2, 4, 8, 8 * 16, out, in, cnt, tmp
116 CAVS_IDCT8_1D inq, [pw_4] ; pass 1 over the input, small rounding bias
126 TRANSPOSE4x4W 0, 2, 4, 6, 7 ; transpose one 4x4 quadrant (m7 = scratch)
132 TRANSPOSE4x4W 7, 5, 3, 1, 0 ; transpose another 4x4 quadrant (m0 = scratch)
146 CAVS_IDCT8_1D tmpq, [pw_64] ; pass 2 over the temp rows, final bias
;------------------------------------------------------------------------------
; void cavs_idct8(int16_t *out, int16_t *in) — wide-register (full-width
; transpose) build.  Same two-pass structure as the version above, but with
; a single 8x8 word transpose between the passes: with a 9th vector register
; (m8, available because the register count is 8 + ARCH_X86_64) on x86-64,
; or spilling through two stack slots on x86-32.  The negative stack size
; (0 - 8*16) is the x86inc convention requesting an aligned 8*16-byte stack
; area; the second pass reads it back via rsp with init_load=0, i.e. the
; odd rows are expected to be in registers already.
; NOTE(review): the two TRANSPOSE8x8W lines are presumably the two arms of
; an %if ARCH_X86_64 / %else that is elided from this excerpt, along with
; the stores and epilogue; column-1 numbers are full-file line numbers.
;------------------------------------------------------------------------------
173 cglobal cavs_idct8, 2, 2, 8 + ARCH_X86_64, 0 - 8 * 16, out, in
174 CAVS_IDCT8_1D inq, [pw_4] ; pass 1 over the input, small rounding bias
184 TRANSPOSE8x8W 7, 5, 3, 1, 0, 2, 4, 6, 8 ; x86-64: m8 as transpose scratch
188 TRANSPOSE8x8W 7, 5, 3, 1, 0, 2, 4, 6, [rsp+0*16], [rsp+4*16], 1 ; x86-32: spill via stack
193 CAVS_IDCT8_1D rsp, [pw_64], 0 ; pass 2 on stack copy, rows preloaded