4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
5 * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "libavutil/arm/asm.S"
/* Fixed-point IDCT weights: Wi = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer. */
26 #define W1 22725 /* i = 1 */
27 #define W2 21407 /* i = 2 */
28 #define W3 19266 /* i = 3 */
29 #define W4 16383 /* i = 4: the formula above gives 16384; the value used here is 16384 - 1 */
30 #define W5 12873 /* i = 5 */
31 #define W6 8867 /* i = 6 */
32 #define W7 4520 /* i = 7 */
/* Pairs of weights packed into one 32-bit word: first-named weight in bits 15:0, second in bits 31:16. */
36 #define W13 (W1 | (W3 << 16))
37 #define W26 (W2 | (W6 << 16))
38 #define W57 (W5 | (W7 << 16))
/*
 * idct_row_armv5te: row pass of the IDCT.
 * Per the existing comments, a1 points at eight 16-bit coefficients row[0..7],
 * loaded two per 32-bit register (row[1:0], row[3:2] at offset 0; row[5:4],
 * row[7:6] at offset 8).
 * NOTE(review): only part of this routine is visible in this excerpt; the
 * comments below describe the visible instructions only.
 */
46 function idct_row_armv5te
50 ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */
/* Start the first accumulator with the rounding term, half of 1<<ROW_SHIFT. */
57 mov v1, #(1<<(ROW_SHIFT-1))
/* ip is decremented to W4 here; it is presumably set to W4+1 by an instruction not shown. */
59 sub ip, ip, #1 /* ip = W4 */
/* smlabb: v1 = v1 + (bits 15:0 of ip) * (bits 15:0 of a3), signed 16x16 multiply. */
60 smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */
/* w26, w13 and w57 are memory words read with ldr; their definitions are not visible in this excerpt. */
61 ldr ip, w26 /* ip = W2 | (W6 << 16) */
69 ldr ip, w13 /* ip = W1 | (W3 << 16) */
70 ldr lr, w57 /* lr = W5 | (W7 << 16) */
81 ldrd a3, [a1, #8] /* a3=row[5:4] a4=row[7:6] */
/* Add the row[4] and row[6] contributions to the four accumulators v1..v4. */
97 ldr ip, w26 /* ip = W2 | (W6 << 16) */
99 sub a2, a2, #1 /* a2 = W4 */
100 smulbb a2, a2, a3 /* a2 = W4*row[4] */
/* smultb takes bits 31:16 of ip (W6) and bits 15:0 of a4 (row[6]). */
101 smultb lr, ip, a4 /* lr = W6*row[6] */
102 add v1, v1, a2 /* v1 += W4*row[4] */
103 add v1, v1, lr /* v1 += W6*row[6] */
104 add v4, v4, a2 /* v4 += W4*row[4] */
105 sub v4, v4, lr /* v4 -= W6*row[6] */
/* lr is reused: smulbb takes bits 15:0 of ip (W2) and bits 15:0 of a4 (row[6]). */
106 smulbb lr, ip, a4 /* lr = W2*row[6] */
107 sub v2, v2, a2 /* v2 -= W4*row[4] */
108 sub v2, v2, lr /* v2 -= W2*row[6] */
109 sub v3, v3, a2 /* v3 -= W4*row[4] */
110 add v3, v3, lr /* v3 += W2*row[6] */
/*
 * Pack two values into one word: clear bits 16..20 of the value that stays in
 * the low halfword, then add the second value shifted into bits 31:16.
 * NOTE(review): presumably the cleared bits are leftovers of a right shift
 * that is not shown here - confirm.
 */
114 bic a3, a3, #0x1f0000
117 add a3, a3, a2, lsl #16
120 bic a4, a4, #0x1f0000
123 add a4, a4, a2, lsl #16
/* Same packing sequence again for a second pair of output words. */
128 bic a3, a3, #0x1f0000
131 add a3, a3, a2, lsl #16
134 bic a4, a4, #0x1f0000
137 add a4, a4, a2, lsl #16
/* a3 |= a3 << 16: the low halfword of a3 is ORed into its upper halfword. */
143 orr a3, a3, a3, lsl #16
/*
 * Column-pass arithmetic (the existing comments refer to col[]).
 * NOTE(review): the header of the enclosing definition is not visible in this
 * excerpt. Each 32-bit load from a1 + 16*k carries two 16-bit values, per the
 * "col[1:0]" comment on the first load.
 *
 * Instruction legend: smlaXY Rd, Rn, Rm, Ra computes
 *   Rd = Ra + (X half of Rn) * (Y half of Rm)
 * where b selects bits 15:0 and t selects bits 31:16, both treated as signed.
 * In the visible multiply-accumulates, a data register's low half (Y = b)
 * always accumulates into v1/v3/v5/v7 and its high half (Y = t) into
 * v2/v4/v6/fp.
 */
154 ldr a4, [a1] /* a4 = col[1:0] */
/* ip is decremented to W4 here; it is presumably set to W4+1 by an instruction not shown. */
156 sub ip, ip, #1 /* ip = W4 */
/*
 * NOTE(review): the next four instructions and the six after them both
 * produce W4-scaled, rounded terms in v1 and v2. They look like alternatives
 * selected by a preprocessor conditional that is not visible here - confirm.
 */
158 mov v1, #(1<<(COL_SHIFT-1))
159 smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
160 smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
161 ldr a4, [a1, #(16*4)]
/* Variant that adds the rounding term pre-divided by W4, then multiplies the sum by W4. */
163 mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
/* v2 = v1 + (a4 >> 16, arithmetic), i.e. rounding term plus the upper halfword of a4. */
164 add v2, v1, a4, asr #16
/* v2 = (v2 << 14) - v2 = v2 * 16383, which is v2 * W4. */
165 rsb v2, v2, v2, lsl #14
/* v1 = v1 + (a4 >> 16, arithmetic). */
167 add v1, v1, a4, asr #16
168 ldr a4, [a1, #(16*4)]
/* v1 = (v1 << 14) - v1 = v1 * W4. */
169 rsb v1, v1, v1, lsl #14
182 ldr a4, [a1, #(16*2)]
196 ldr a4, [a1, #(16*6)]
/* Push eight registers (v1..v7, fp) onto the full-descending stack. */
212 stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp}
215 ldr a4, [a1, #(16*1)]
/* Word at offset 16*3: upper halves of ip and lr are the multipliers. */
226 ldr a4, [a1, #(16*3)]
229 smlatb v1, ip, a4, v1
230 smlatb v3, lr, a4, v3
235 smlatt v2, ip, a4, v2
236 smlatt v4, lr, a4, v4
/* Word at offset 16*5: both halves of ip and lr are used as multipliers. */
240 ldr a4, [a1, #(16*5)]
243 smlabb v1, lr, a4, v1
244 smlabb v3, ip, a4, v3
245 smlatb v5, lr, a4, v5
246 smlatb v7, ip, a4, v7
247 smlabt v2, lr, a4, v2
248 smlabt v4, ip, a4, v4
249 smlatt v6, lr, a4, v6
/* The word at offset 16*7 is loaded into a3 while a4 is still needed by the next multiply-accumulate. */
250 ldr a3, [a1, #(16*7)]
251 smlatt fp, ip, a4, fp
/* Word at offset 16*7 (in a3). */
253 smlatb v1, lr, a3, v1
254 smlabb v3, lr, a3, v3
255 smlatb v5, ip, a3, v5
257 smlatt v2, lr, a3, v2
259 smlabt v4, lr, a3, v4
261 smlatt v6, ip, a3, v6
/*
 * idct_col_armv5te: column pass that writes packed pairs of 16-bit results
 * back to the block at a1.
 * NOTE(review): only part of this routine is visible in this excerpt.
 *
 * Visible pattern, repeated per output word:
 *   orrmi  - if the N flag is set, set bits 12..15 of a2. The flag-setting
 *            instruction is not shown; presumably this sign-extends a value
 *            that was shifted right logically - confirm.
 *   orr    - merge a second value (ip or a4) into bits 31:16 of a2.
 *   str    - store the packed word at byte offset 16*k from a1.
 * The visible stores go to k = 7, 1, 6, 2, 5, 3, 4, in that order.
 */
265 function idct_col_armv5te
274 orrmi a2, a2, #0xf000
277 orr a2, a2, ip, lsl #16
282 orrmi a2, a2, #0xf000
285 orr a2, a2, a4, lsl #16
287 str a2, [a1, #(16*7)]
292 orrmi a2, a2, #0xf000
295 orr a2, a2, ip, lsl #16
296 str a2, [a1, #(16*1)]
300 orrmi a2, a2, #0xf000
303 orr a2, a2, a4, lsl #16
305 str a2, [a1, #(16*6)]
310 orrmi a2, a2, #0xf000
313 orr a2, a2, ip, lsl #16
314 str a2, [a1, #(16*2)]
318 orrmi a2, a2, #0xf000
321 orr a2, a2, a4, lsl #16
323 str a2, [a1, #(16*5)]
328 orrmi a2, a2, #0xf000
331 orr a2, a2, ip, lsl #16
332 str a2, [a1, #(16*3)]
336 orrmi a2, a2, #0xf000
339 orr a2, a2, a4, lsl #16
340 str a2, [a1, #(16*4)]
/*
 * clip: macro taking a destination register and a variadic source operand
 * list (src:vararg collects everything after the first argument).
 * NOTE(review): the macro body is not visible in this excerpt.
 */
345 .macro clip dst, src:vararg
/*
 * aclip: same parameter shape as clip. Visible call sites pass a register
 * followed by a shifted register as the source list.
 * NOTE(review): the macro body is not visible in this excerpt.
 */
354 .macro aclip dst, src:vararg
/*
 * idct_col_put_armv5te: column pass whose results are packed as bytes.
 * NOTE(review): only part of this routine is visible in this excerpt.
 * Each visible orr merges a second value into bits 8 and above of the first,
 * forming a two-byte pair (assuming both values already fit in 8 bits -
 * TODO confirm against the instructions not shown).
 */
363 function idct_col_put_armv5te
374 orr a2, a2, ip, lsl #8
383 orr a2, a3, a4, lsl #8
/* v2 = (lr << 3) - lr = 7 * lr. */
384 rsb v2, lr, lr, lsl #3
392 orr a2, a2, ip, lsl #8
398 orr a2, a2, a4, lsl #8
406 orr a2, a2, ip, lsl #8
412 orr a2, a2, a4, lsl #8
420 orr a2, a2, ip, lsl #8
426 orr a2, a2, a4, lsl #8
/*
 * idct_col_add_armv5te: column pass that runs each result through aclip
 * before packing it as a byte.
 * NOTE(review): only part of this routine is visible in this excerpt, and the
 * aclip macro body is not shown.
 *
 * Visible pattern, repeated per output pair:
 *   aclip X, R, X, asr #20  - first value; the source list is a register plus
 *                             X shifted right arithmetically by 20.
 *   aclip Y, Y, ip, lsr #8  - second value; the source list is Y plus ip
 *                             shifted right logically by 8.
 *   orr                     - merge the second value into bits 8 and above of
 *                             the first.
 * NOTE(review): the operands suggest the result is combined with a byte
 * taken from ip - confirm against the macro definition.
 */
432 function idct_col_add_armv5te
444 aclip a2, v1, a2, asr #20
447 aclip v1, v1, ip, lsr #8
448 orr a2, a2, v1, lsl #8
/* v2 = (v1 << 3) - v1 = 7 * v1. */
451 rsb v2, v1, v1, lsl #3
455 aclip a3, a2, a3, asr #20
457 aclip a4, a4, ip, lsr #8
460 orr a2, a3, a4, lsl #8
468 aclip a2, v3, a2, asr #20
471 aclip v3, v3, ip, lsr #8
472 orr a2, a2, v3, lsl #8
477 aclip a3, a2, a3, asr #20
479 aclip a4, a4, ip, lsr #8
480 orr a2, a3, a4, lsl #8
488 aclip a2, v3, a2, asr #20
491 aclip v3, v3, ip, lsr #8
492 orr a2, a2, v3, lsl #8
497 aclip a3, a2, a3, asr #20
499 aclip a4, a4, ip, lsr #8
500 orr a2, a3, a4, lsl #8
508 aclip a2, v3, a2, asr #20
511 aclip v3, v3, ip, lsr #8
512 orr a2, a2, v3, lsl #8
517 aclip a3, a2, a3, asr #20
519 aclip a4, a4, ip, lsr #8
520 orr a2, a3, a4, lsl #8
/*
 * ff_simple_idct_armv5te: entry point declared with export=1.
 * NOTE(review): the body between the push and the pop is not visible in this
 * excerpt.
 */
526 function ff_simple_idct_armv5te, export=1
/* Save v1..v7 and fp (r4..r11, callee-saved under the ARM procedure call standard) and the return address. */
527 stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
/* Restore the saved registers; the saved lr value is loaded into pc, which returns to the caller. */
555 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
/*
 * ff_simple_idct_add_armv5te: entry point declared with export=1. The visible
 * part calls idct_col_add_armv5te four times.
 * NOTE(review): the remaining instructions are not visible in this excerpt.
 */
558 function ff_simple_idct_add_armv5te, export=1
/* Besides the callee-saved registers and lr, the first two arguments (a1, a2) are pushed as well. */
559 stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
581 bl idct_col_add_armv5te
583 bl idct_col_add_armv5te
585 bl idct_col_add_armv5te
587 bl idct_col_add_armv5te
/*
 * Restore the saved registers and return by loading pc.
 * NOTE(review): this list omits the a1/a2 slots pushed on entry, so sp must
 * already have been advanced past those 8 bytes by an instruction not shown
 * here - confirm.
 */
590 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
/*
 * ff_simple_idct_put_armv5te: entry point declared with export=1. The visible
 * part calls idct_col_put_armv5te four times.
 * NOTE(review): the remaining instructions are not visible in this excerpt.
 */
593 function ff_simple_idct_put_armv5te, export=1
/* Besides the callee-saved registers and lr, the first two arguments (a1, a2) are pushed as well. */
594 stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
616 bl idct_col_put_armv5te
618 bl idct_col_put_armv5te
620 bl idct_col_put_armv5te
622 bl idct_col_put_armv5te
/*
 * Restore the saved registers and return by loading pc.
 * NOTE(review): this list omits the a1/a2 slots pushed on entry, so sp must
 * already have been advanced past those 8 bytes by an instruction not shown
 * here - confirm.
 */
625 ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}