4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
5 * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "libavutil/arm/asm.S"
/* Fixed-point IDCT weights: Wi = round(cos(i*M_PI/16) * sqrt(2) * (1 << 14)) */
26 #define W1 22725 /* cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
27 #define W2 21407 /* cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
28 #define W3 19266 /* cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
29 #define W4 16383 /* cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5; NOTE(review): exact rounding gives 16384 — the code also derives this as 16384-1 (see the `sub ip, ip, #1` sites below), so 16383 looks deliberate; confirm before "fixing" */
30 #define W5 12873 /* cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
31 #define W6 8867 /* cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
32 #define W7 4520 /* cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
/* Weight pairs packed (low | high<<16) for the SMLAxy/SMULxy halfword
   multiplies used below: the "b"/"t" operand suffixes select the
   bottom/top 16-bit half of a register. */
36 #define W13 (W1 | (W3 << 16))
37 #define W26 (W2 | (W6 << 16))
38 #define W57 (W5 | (W7 << 16))
/*
 * idct_row_armv5te: in-place 1-D inverse DCT over one 8-element row of
 * 16-bit coefficients addressed by a1 (halfwords are processed in
 * packed pairs, e.g. a3 = row[1]:row[0]).
 * NOTE(review): this listing is elided — the line numbers embedded in
 * each line jump, so instructions between the visible lines (including
 * the function epilogue) are missing from this excerpt.  Do not
 * restructure this code without the full file.
 */
40 function idct_row_armv5te
44 ldrd a3, a4, [a1] /* a3 = row[1:0], a4 = row[3:2] */
/* ROW_SHIFT is defined elsewhere in the file (not visible here). */
51 mov v1, #(1<<(ROW_SHIFT-1))
/* ip was presumably loaded with 16384 on an elided line; 16384-1 = W4. */
53 sub ip, ip, #1 /* ip = W4 */
54 smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */
55 ldr ip, =W26 /* ip = W2 | (W6 << 16) */
63 ldr ip, =W13 /* ip = W1 | (W3 << 16) */
64 ldr lr, =W57 /* lr = W5 | (W7 << 16) */
75 ldrd a3, a4, [a1, #8] /* a3=row[5:4] a4=row[7:6] */
91 ldr ip, =W26 /* ip = W2 | (W6 << 16) */
/* a2 presumably held 16384 from an elided load; 16384-1 = W4. */
93 sub a2, a2, #1 /* a2 = W4 */
94 smulbb a2, a2, a3 /* a2 = W4*row[4] */
95 smultb lr, ip, a4 /* lr = W6*row[6] */
/* Fold the even-coefficient terms (W4*row[4], W2/W6*row[6]) into the
   four even-part accumulators v1..v4 with butterfly signs. */
96 add v1, v1, a2 /* v1 += W4*row[4] */
97 add v1, v1, lr /* v1 += W6*row[6] */
98 add v4, v4, a2 /* v4 += W4*row[4] */
99 sub v4, v4, lr /* v4 -= W6*row[6] */
100 smulbb lr, ip, a4 /* lr = W2*row[6] */
101 sub v2, v2, a2 /* v2 -= W4*row[4] */
102 sub v2, v2, lr /* v2 -= W2*row[6] */
103 sub v3, v3, a2 /* v3 -= W4*row[4] */
104 add v3, v3, lr /* v3 += W2*row[6] */
/* NOTE(review): the bic/add pairs below merge shifted 16-bit results
   into packed output words; the shifts that produce a2 between them
   are elided from this excerpt. */
108 bic a3, a3, #0x1f0000
111 add a3, a3, a2, lsl #16
114 bic a4, a4, #0x1f0000
117 add a4, a4, a2, lsl #16
122 bic a3, a3, #0x1f0000
125 add a3, a3, a2, lsl #16
128 bic a4, a4, #0x1f0000
131 add a4, a4, a2, lsl #16
/* Store packed results back over row[7:4]. */
132 strd a3, a4, [a1, #8]
/* Replicate the low halfword of a3 into both halves — presumably the
   shortcut path for an all-DC row (confirm against elided context). */
137 orr a3, a3, a3, lsl #16
142 strd a3, a4, [a1, #8]
/*
 * Column-transform interior (original lines 148-248): accumulates one
 * column of the 8x8 block, read with a stride of 16 bytes (8 halfwords
 * per row), even part into v1..v4 and odd part into further
 * accumulators up to fp.
 * NOTE(review): the enclosing .macro (or function) header and its
 * terminator are not visible in this excerpt — treat the boundaries of
 * this block as approximate and do not move these lines.
 */
148 ldr a4, [a1] /* a4 = col[1:0] */
/* ip presumably held 16384 from an elided load; 16384-1 = W4. */
150 sub ip, ip, #1 /* ip = W4 */
/* COL_SHIFT is defined elsewhere in the file (not visible here). */
151 mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
152 add v2, v1, a4, asr #16
/* rsb x, x, x, lsl #14 computes x*((1<<14)-1) = x*W4. */
153 rsb v2, v2, v2, lsl #14
154 add v1, v1, a4, asr #16
155 ldr a4, [a1, #(16*4)]
156 rsb v1, v1, v1, lsl #14
/* Even rows 2/4/6 of this column (intervening arithmetic elided). */
169 ldr a4, [a1, #(16*2)]
183 ldr a4, [a1, #(16*6)]
/* Spill callee-saved registers so v1..fp can hold 8 odd-part sums. */
199 stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp}
/* Odd rows 1/3/5/7: ip/lr hold packed weight pairs loaded on elided
   lines; smla{b,t}{b,t} picks the bottom/top halfword of each operand
   (first suffix = first source reg, second = second source reg). */
202 ldr a4, [a1, #(16*1)]
213 ldr a4, [a1, #(16*3)]
216 smlatb v1, ip, a4, v1
217 smlatb v3, lr, a4, v3
222 smlatt v2, ip, a4, v2
223 smlatt v4, lr, a4, v4
227 ldr a4, [a1, #(16*5)]
230 smlabb v1, lr, a4, v1
231 smlabb v3, ip, a4, v3
232 smlatb v5, lr, a4, v5
233 smlatb v7, ip, a4, v7
234 smlabt v2, lr, a4, v2
235 smlabt v4, ip, a4, v4
236 smlatt v6, lr, a4, v6
237 ldr a3, [a1, #(16*7)]
238 smlatt fp, ip, a4, fp
240 smlatb v1, lr, a3, v1
241 smlabb v3, lr, a3, v3
242 smlatb v5, ip, a3, v5
244 smlatt v2, lr, a3, v2
246 smlabt v4, lr, a3, v4
248 smlatt v6, ip, a3, v6
/*
 * idct_col_armv5te: column transform + writeback.  Each visible group
 * packs two 16-bit column results into one word (orr ..., lsl #16) and
 * stores it back into the coefficient block at row offsets #(16*k).
 * `orrmi a2, a2, #0xf000` re-inserts sign bits into the low halfword
 * when the preceding (elided) arithmetic left the N flag set, i.e. the
 * result was negative.  The function prologue, the arithmetic
 * producing a2/ip/a4, the row-0 store, and the epilogue are all elided
 * from this excerpt.
 */
252 function idct_col_armv5te
261 orrmi a2, a2, #0xf000
264 orr a2, a2, ip, lsl #16
269 orrmi a2, a2, #0xf000
272 orr a2, a2, a4, lsl #16
274 str a2, [a1, #(16*7)]
279 orrmi a2, a2, #0xf000
282 orr a2, a2, ip, lsl #16
283 str a2, [a1, #(16*1)]
287 orrmi a2, a2, #0xf000
290 orr a2, a2, a4, lsl #16
292 str a2, [a1, #(16*6)]
297 orrmi a2, a2, #0xf000
300 orr a2, a2, ip, lsl #16
301 str a2, [a1, #(16*2)]
305 orrmi a2, a2, #0xf000
308 orr a2, a2, a4, lsl #16
310 str a2, [a1, #(16*5)]
315 orrmi a2, a2, #0xf000
318 orr a2, a2, ip, lsl #16
319 str a2, [a1, #(16*3)]
323 orrmi a2, a2, #0xf000
326 orr a2, a2, a4, lsl #16
327 str a2, [a1, #(16*4)]
/* clip: clamp helper macro — body elided from this excerpt. */
332 .macro clip dst, src:vararg
/* aclip: add-and-clamp variant used by the put/add column stores below
   (invoked as `aclip dst, acc, src, shift`) — body elided from this
   excerpt, so the exact clamp range cannot be confirmed here;
   presumably saturation to the 0..255 pixel range. */
341 .macro aclip dst, src:vararg
/*
 * idct_col_put_armv5te: column transform + store of clamped 8-bit
 * pixels to a destination buffer.  Pairs of result bytes are packed
 * with `orr ..., lsl #8` before the (elided) halfword/word stores.
 * NOTE(review): heavily elided — the clamping (clip/aclip uses),
 * address computation and stores between these lines are missing.
 */
350 function idct_col_put_armv5te
361 orr a2, a2, ip, lsl #8
370 orr a2, a3, a4, lsl #8
/* rsb x, y, y, lsl #3 computes y*7 — presumably a 7*stride step to
   rewind/advance the destination pointer; confirm against elided code. */
371 rsb v2, lr, lr, lsl #3
379 orr a2, a2, ip, lsl #8
385 orr a2, a2, a4, lsl #8
393 orr a2, a2, ip, lsl #8
399 orr a2, a2, a4, lsl #8
407 orr a2, a2, ip, lsl #8
413 orr a2, a2, a4, lsl #8
/*
 * idct_col_add_armv5te: column transform + add-to-destination.  Each
 * `aclip ..., asr #20` / `aclip ..., lsr #8` pair clamps a shifted
 * result via the aclip macro above (macro body elided), then byte
 * pairs are packed with `orr ..., lsl #8` for the (elided) stores.
 * NOTE(review): heavily elided — loads of the destination pixels,
 * the adds themselves, and the stores are missing from this excerpt.
 */
419 function idct_col_add_armv5te
431 aclip a2, v1, a2, asr #20
434 aclip v1, v1, ip, lsr #8
435 orr a2, a2, v1, lsl #8
/* rsb x, y, y, lsl #3 computes y*7 — presumably a 7*stride pointer
   step; confirm against elided code. */
438 rsb v2, v1, v1, lsl #3
442 aclip a3, a2, a3, asr #20
444 aclip a4, a4, ip, lsr #8
447 orr a2, a3, a4, lsl #8
455 aclip a2, v3, a2, asr #20
458 aclip v3, v3, ip, lsr #8
459 orr a2, a2, v3, lsl #8
464 aclip a3, a2, a3, asr #20
466 aclip a4, a4, ip, lsr #8
467 orr a2, a3, a4, lsl #8
475 aclip a2, v3, a2, asr #20
478 aclip v3, v3, ip, lsr #8
479 orr a2, a2, v3, lsl #8
484 aclip a3, a2, a3, asr #20
486 aclip a4, a4, ip, lsr #8
487 orr a2, a3, a4, lsl #8
495 aclip a2, v3, a2, asr #20
498 aclip v3, v3, ip, lsr #8
499 orr a2, a2, v3, lsl #8
504 aclip a3, a2, a3, asr #20
506 aclip a4, a4, ip, lsr #8
507 orr a2, a3, a4, lsl #8
/*
 * ff_simple_idct_armv5te: exported in-place 8x8 IDCT entry point
 * (a1 = coefficient block, per the row/column code above).
 * Saves the callee-saved registers it uses plus lr, and returns by
 * popping the saved lr directly into pc.  The row/column passes
 * between push and pop are elided from this excerpt.
 */
513 function ff_simple_idct_armv5te, export=1
514 stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
542 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
/*
 * ff_simple_idct_add_armv5te: exported IDCT whose clamped results are
 * added into a destination buffer via idct_col_add_armv5te.  Also
 * pushes a1/a2 (presumably dest pointer and line size — confirm
 * against the C prototype); those two slots are not popped here, so
 * the elided body or the callee must consume them.  The column pass
 * runs as four calls — with packed 2-per-word processing that covers
 * all eight columns.  Row-pass calls and pointer stepping between the
 * visible lines are elided.
 */
545 function ff_simple_idct_add_armv5te, export=1
546 stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
568 bl idct_col_add_armv5te
570 bl idct_col_add_armv5te
572 bl idct_col_add_armv5te
574 bl idct_col_add_armv5te
577 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
/*
 * ff_simple_idct_put_armv5te: exported IDCT whose clamped results are
 * written (not added) to a destination buffer via
 * idct_col_put_armv5te.  Mirrors ff_simple_idct_add_armv5te above:
 * a1/a2 are pushed but not popped here (presumably dest pointer and
 * line size consumed by the elided body or the callee — confirm), and
 * the column pass runs as four calls.  Row-pass calls and pointer
 * stepping between the visible lines are elided.
 */
580 function ff_simple_idct_put_armv5te, export=1
581 stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
603 bl idct_col_put_armv5te
605 bl idct_col_put_armv5te
607 bl idct_col_put_armv5te
609 bl idct_col_put_armv5te
612 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}