1 ;******************************************************************************
2 ;* x86-SIMD-optimized IDCT for prores
3 ;* this is identical to "simple" IDCT written by Michael Niedermayer
4 ;* except for the clip range
6 ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 ; add SECTION_RODATA and proper include before including this file!
29 ; interleave data while maintaining source
30 ; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
; %1 is the punpck size suffix (the call sites in this file pass "wd").
; %2..%5 are register numbers; they are pasted after "m" to form operands.
; Both unpacks read m%4/m%5 and write a separate destination register, so
; the inputs survive as long as %2/%3 name different registers than %4/%5.
32 punpckl%1 m%2, m%4, m%5 ; m%2 = interleave of the low halves of m%4 and m%5
33 punpckh%1 m%3, m%4, m%5 ; m%3 = interleave of the high halves of m%4 and m%5
36 ; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
37 ; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
38 ; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
; Each operand pair holds eight dword lanes: first register = lanes [0-3],
; second register = lanes [4-7].
; Result: %1 = eight sums as words, %3 = eight differences as words.
; %2 and %4 are overwritten as intermediates; %5/%6 are only read.
; The differences are formed first because the adds below overwrite %1/%2.
; NOTE(review): no shift by %7 appears between the adds and the packs in the
; lines shown here - confirm that step against the complete file.
40 psubd %3, %1, %5 ; { a0 - b0 }[0-3]
41 psubd %4, %2, %6 ; { a0 - b0 }[4-7]
42 paddd %1, %5 ; { a0 + b0 }[0-3]
43 paddd %2, %6 ; { a0 + b0 }[4-7]
48 packssdw %1, %2 ; row[0] at the first call site: sums, dwords -> words (signed saturation)
49 packssdw %3, %4 ; row[7] at the first call site: differences, dwords -> words (signed saturation)
52 ; %1 = initial bias ("" if nop)
53 ; %2 = number of bits to shift at the end
54 ; %3 = qmat (for prores)
;
; One 1-D pass of the 8-point IDCT, computed on eight 16-bit lanes at once.
; In (words):  m10 = row[0], m8 = row[2], m13 = row[4], m12 = row[6];
;              rows 1/3/5/7 are read from [COEFFS+16/48/80/112].
; Out (words): row[0..7] = m8, m0, m1, m2, m4, m11, m9, m10.
; Scratch:     m0-m15 are all written; the COEFFS buffer is reused to spill
;              the dword accumulators a0..a3 between the even and odd parts.
; Flow:        even part a0..a3 from rows 0/2/4/6, odd part b0..b3 from rows
;              1/3/5/7, then SUMSUB_SHPK forms a+b / a-b with %2 as its shift
;              argument.
; Register convention throughout: an accumulator lives in a register pair,
; even register = lanes [0-3], odd register = lanes [4-7] (dwords).
;
; ---- even part, rows 0 and 2 ----
56 ; a0 = (W4 * row[0]) + (1 << (15 - 1));
65 mova m15, [pd_round_ %+ %2] ; constant named pd_round_<%2>; presumably 1<<(%2-1) per the rounding note below - confirm
69 SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
70 pmaddwd m2, m0, [w4_plus_w6] ; a1[0-3], row[0]/row[2] terms
71 pmaddwd m3, m1, [w4_plus_w6] ; a1[4-7]
72 pmaddwd m4, m0, [w4_min_w6] ; a2[0-3], row[0]/row[2] terms
73 pmaddwd m5, m1, [w4_min_w6] ; a2[4-7]
74 pmaddwd m6, m0, [w4_min_w2] ; a3[0-3], row[0]/row[2] terms
75 pmaddwd m7, m1, [w4_min_w2] ; a3[4-7]
76 pmaddwd m0, [w4_plus_w2] ; a0[0-3]; done last because it overwrites the interleaved input
77 pmaddwd m1, [w4_plus_w2] ; a0[4-7]
79 ; Adding 1<<(%2-1) for >=15 bits values
90 ; a0: -1*row[0]-1*row[2]
93 ; a3: -1*row[0]+1*row[2]
;
; ---- even part, rows 4 and 6 ----
95 ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
96 ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
97 ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
98 ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
99 SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
100 pmaddwd m10, m8, [w4_plus_w6]
101 pmaddwd m11, m9, [w4_plus_w6]
102 paddd m0, m10 ; a0[0-3]
103 paddd m1, m11 ; a0[4-7]
104 pmaddwd m10, m8, [w4_min_w6]
105 pmaddwd m11, m9, [w4_min_w6]
106 paddd m6, m10 ; a3[0-3]
107 paddd m7, m11 ; a3[4-7]
108 pmaddwd m10, m8, [w4_min_w2]
109 pmaddwd m11, m9, [w4_min_w2]
110 pmaddwd m8, [w4_plus_w2] ; in-place: m8/m9 are not needed as inputs after this
111 pmaddwd m9, [w4_plus_w2]
112 psubd m4, m10 ; a2[0-3] intermediate
113 psubd m5, m11 ; a2[4-7] intermediate
114 psubd m2, m8 ; a1[0-3] intermediate
115 psubd m3, m9 ; a1[4-7] intermediate
;
; ---- spill a0..a3, fetch the odd rows ----
; The [0-3] halves go to offsets 32/64/96. The odd rows are loaded before
; their slots (16/48/80/112) are overwritten with the [4-7] halves, so the
; load/store order below must be kept.
; NOTE(review): a0[0-3] (m0) is reloaded from [COEFFS+0] at the end of this
; macro, but the matching store is not among the lines shown here - confirm.
119 mova [COEFFS+ 32], m2 ; a1[0-3]
120 mova [COEFFS+ 64], m4 ; a2[0-3]
121 mova [COEFFS+ 96], m6 ; a3[0-3]
122 mova m10,[COEFFS+ 16] ; { row[1] }[0-7]
123 mova m8, [COEFFS+ 48] ; { row[3] }[0-7]
124 mova m13,[COEFFS+ 80] ; { row[5] }[0-7]
125 mova m14,[COEFFS+112] ; { row[7] }[0-7]
126 mova [COEFFS+ 16], m1 ; a0[4-7]
127 mova [COEFFS+ 48], m3 ; a1[4-7]
128 mova [COEFFS+ 80], m5 ; a2[4-7]
129 mova [COEFFS+112], m7 ; a3[4-7]
;
; ---- odd part, rows 1 and 3 ----
137 ; b0 = MUL(W1, row[1]);
138 ; MAC(b0, W3, row[3]);
139 ; b1 = MUL(W3, row[1]);
140 ; MAC(b1, -W7, row[3]);
141 ; b2 = MUL(W5, row[1]);
142 ; MAC(b2, -W1, row[3]);
143 ; b3 = MUL(W7, row[1]);
144 ; MAC(b3, -W5, row[3]);
145 SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
146 pmaddwd m2, m0, [w3_min_w7] ; b1[0-3], row[1]/row[3] terms
147 pmaddwd m3, m1, [w3_min_w7] ; b1[4-7]
148 pmaddwd m4, m0, [w5_min_w1] ; b2[0-3], row[1]/row[3] terms
149 pmaddwd m5, m1, [w5_min_w1] ; b2[4-7]
150 pmaddwd m6, m0, [w7_min_w5] ; b3[0-3], row[1]/row[3] terms
151 pmaddwd m7, m1, [w7_min_w5] ; b3[4-7]
152 pmaddwd m0, [w1_plus_w3] ; b0[0-3]; done last because it overwrites the interleaved input
153 pmaddwd m1, [w1_plus_w3] ; b0[4-7]
155 ; b0: +1*row[1]+2*row[3]
156 ; b1: +2*row[1]-1*row[3]
157 ; b2: -1*row[1]-1*row[3]
158 ; b3: +1*row[1]+1*row[3]
;
; ---- odd part, rows 5 and 7 ----
160 ; MAC(b0, W5, row[5]);
161 ; MAC(b0, W7, row[7]);
162 ; MAC(b1, -W1, row[5]);
163 ; MAC(b1, -W5, row[7]);
164 ; MAC(b2, W7, row[5]);
165 ; MAC(b2, W3, row[7]);
166 ; MAC(b3, W3, row[5]);
167 ; MAC(b3, -W1, row[7]);
168 SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
170 ; b0: -1*row[5]+1*row[7]
171 ; b1: -1*row[5]+1*row[7]
172 ; b2: +1*row[5]+2*row[7]
173 ; b3: +2*row[5]-1*row[7]
175 pmaddwd m10, m8, [w1_plus_w5]
176 pmaddwd m11, m9, [w1_plus_w5]
177 pmaddwd m12, m8, [w5_plus_w7]
178 pmaddwd m13, m9, [w5_plus_w7]
179 psubd m2, m10 ; b1[0-3]
180 psubd m3, m11 ; b1[4-7]
181 paddd m0, m12 ; b0[0-3]
182 paddd m1, m13 ; b0[4-7]
183 pmaddwd m12, m8, [w7_plus_w3]
184 pmaddwd m13, m9, [w7_plus_w3]
185 pmaddwd m8, [w3_min_w1] ; in-place: m8/m9 are not needed as inputs after this
186 pmaddwd m9, [w3_min_w1]
187 paddd m4, m12 ; b2[0-3]
188 paddd m5, m13 ; b2[4-7]
189 paddd m6, m8 ; b3[0-3]
190 paddd m7, m9 ; b3[4-7]
;
; ---- combine: reload each spilled a<n>, form a<n> +/- b<n> ----
; The ">> 15" in the reference lines below is the C formulation; the count
; handed to SUMSUB_SHPK here is %2.
192 ; row[0] = (a0 + b0) >> 15;
193 ; row[7] = (a0 - b0) >> 15;
194 ; row[1] = (a1 + b1) >> 15;
195 ; row[6] = (a1 - b1) >> 15;
196 ; row[2] = (a2 + b2) >> 15;
197 ; row[5] = (a2 - b2) >> 15;
198 ; row[3] = (a3 + b3) >> 15;
199 ; row[4] = (a3 - b3) >> 15;
200 mova m8, [COEFFS+ 0] ; a0[0-3]
201 mova m9, [COEFFS+16] ; a0[4-7]
202 SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2 ; m8 = row[0], m10 = row[7]
203 mova m0, [COEFFS+32] ; a1[0-3]
204 mova m1, [COEFFS+48] ; a1[4-7]
205 SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2 ; m0 = row[1], m9 = row[6]
206 mova m1, [COEFFS+64] ; a2[0-3]
207 mova m2, [COEFFS+80] ; a2[4-7]
208 SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2 ; m1 = row[2], m11 = row[5]
209 mova m2, [COEFFS+96] ; a3[0-3]
210 mova m3, [COEFFS+112] ; a3[4-7]
211 SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2 ; m2 = row[3], m4 = row[4]
214 ; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t stride,
215 ; int16_t *block, const int16_t *qmat);
218 ; %2 = row bias macro
220 ; %4 = column bias macro
221 ; %5 = min pixel value
222 ; %6 = max pixel value
223 ; %7 = qmat (for prores)
227 ; No clamping, means pure idct
234 ; for (i = 0; i < 8; i++)
235 ; idctRowCondDC(block + i*8);
236 mova m10,[COEFFS+ 0] ; { row[0] }[0-7]
237 mova m8, [COEFFS+32] ; { row[2] }[0-7]
238 mova m13,[COEFFS+64] ; { row[4] }[0-7]
239 mova m12,[COEFFS+96] ; { row[6] }[0-7]
252 ; transpose for second part of IDCT
253 TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
254 mova [COEFFS+ 16], m0
255 mova [COEFFS+ 48], m2
256 mova [COEFFS+ 80], m11
257 mova [COEFFS+112], m10
263 ; for (i = 0; i < 8; i++)
264 ; idctSparseColAdd(dest + i, line_size, block + i);
269 ; No clamping, means pure idct