2 * Copyright (C) 2002 Frederic 'dilb' Boulay
4 * Author: Frederic Boulay <dilb@handhelds.org>
6 * The function defined in this file is derived from the simple_idct function
7 * from the libavcodec library part of the FFmpeg project.
9 * This file is part of FFmpeg.
11 * FFmpeg is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
16 * FFmpeg is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with FFmpeg; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 #include "libavutil/arm/asm.S"
28 /* useful constants for the algorithm */
/* MASK_MSHW keeps only the most significant half-word (upper 16 bits) of a 32-bit word. */
36 #define MASK_MSHW 0xFFFF0000
/* ROW_SHIFT2MSHW (= 5) is the left shift applied before masking with MASK_MSHW when a row result is packed into the upper half-word. */
39 #define ROW_SHIFT2MSHW (16-11)
/* Rounding terms added to a0 in the row pass and in the column pass respectively. */
41 #define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1), i.e. 1<<10 */
42 #define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1), i.e. 1<<19 */
45 function ff_simple_idct_arm, export=1
46 @@ void simple_idct_arm(int16_t *block)
@@ Inverse DCT performed in place on the int16_t block addressed by R0: the rows are
@@ processed first (starting from the last one), then the columns.
47 @@ Prologue: save on the stack the registers that are needed (all of them are used).
48 @@ R0-R3 are scratch regs, so there is no need to save them, but R0 contains the pointer to block,
49 @@ so it must not be overwritten unless it has been saved first.
50 @@ R12 is another scratch register, so it does not need to be saved either.
52 stmfd sp!, {r4-r11, r14} @ push R4-R11 and R14 (R14 is also called LR)
53 @@ at this point, R0=block, other registers are free.
54 add r14, r0, #112 @ R14=&block[8*7] (byte offset 112): better start from the last row and decrease the value until row=0, i.e. R14=block.
55 @@ reserve 2 temporary 32-bit slots on the stack (only sp[0] is written below, with R0)
56 sub sp, sp, #8 @ allow 2 local variables
57 str r0, [sp, #0] @ save block in sp[0]
63 @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
@@ NOTE(review): the "R12=__const_ptr_" annotations look stale - no instruction visible here loads R12; confirm before relying on them.
@@ Row classification: load the 8 coefficients of the current row as four 32-bit words and
@@ detect the cheap cases (row entirely null, or only ROWr16[0] not null).
67 @@ Read the row and check whether it is null, almost null, or neither. According to the StrongARM specs it is not necessary to optimize ldr accesses (i.e. split 32 bits in two 16-bit words); at least it leaves more usable registers :)
68 ldr r1, [r14, #0] @ R1=((int32 *)R14)[0]=ROWr32[0] (current row cast to a 32b pointer)
69 ldr r2, [r14, #4] @ R2=((int32 *)R14)[1]=ROWr32[1]
70 ldr r3, [r14, #8] @ R3=ROWr32[2]
71 ldr r4, [r14, #12] @ R4=ROWr32[3]
72 @@ check if the words are null: if all of them are null, then proceed with the next row (branch __end_row_loop),
73 @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row),
74 @@ else follow the complete algorithm.
75 @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
76 @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
77 orr r5, r4, r3 @ R5=R4 | R3
78 orr r5, r5, r2 @ R5=R4 | R3 | R2
79 orrs r6, r5, r1 @ Test R5 | R1 (Z flag is set when the whole row is null)
81 mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
82 ldrsh r6, [r14, #0] @ R6=ROWr16[0]
83 orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7 (Z flag is set when everything but ROWr16[0] is null)
84 beq __almost_empty_row
87 @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
88 @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
89 @@ R12=__const_ptr_, R14=&block[n]
90 @@ to save some registers/calls, proceed with b0-b3 (built from the odd coefficients row[1], row[3], row[5], row[7]) first, followed by a0-a3
@@ NOTE(review): the muls below use R8 as W1 and R9 as W3; the instructions loading them are not among the lines shown here - TODO confirm.
92 @@ MUL16(b0, W1, row[1]);
93 @@ MUL16(b1, W3, row[1]);
94 @@ MUL16(b2, W5, row[1]);
95 @@ MUL16(b3, W7, row[1]);
96 @@ MAC16(b0, W3, row[3]);
97 @@ MAC16(b1, -W7, row[3]);
98 @@ MAC16(b2, -W1, row[3]);
99 @@ MAC16(b3, -W5, row[3]);
101 mov r2, r2, asr #16 @ R2=ROWr32[1]>>16=ROWr16[3]
102 mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
104 ldr r10, =W5 @ R10=W5
105 mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
106 ldr r11, =W7 @ R11=W7
107 mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
108 mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle); ROWr16[1] is overwritten here
109 teq r2, #0 @ if null avoid muls
@@ the mlane/rsbne below only execute when the teq above found ROWr16[3] != 0 (NE condition)
111 mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
112 rsbne r2, r2, #0 @ R2=-ROWr16[3], so that the remaining mla instructions subtract
113 mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
114 mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
116 mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
118 @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
119 @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
120 @@ R12=__const_ptr_, R14=&block[n]
121 @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
123 orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3] (null when row[4]..row[7] are all null)
124 beq __end_b_evaluation
126 @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
127 @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
128 @@ R12=__const_ptr_, R14=&block[n]
129 @@ MAC16(b0, W5, row[5]);
130 @@ MAC16(b2, W7, row[5]);
131 @@ MAC16(b3, W3, row[5]);
132 @@ MAC16(b1, -W1, row[5]);
133 @@ MAC16(b0, W7, row[7]);
134 @@ MAC16(b2, W3, row[7]);
135 @@ MAC16(b3, -W1, row[7]);
136 @@ MAC16(b1, -W5, row[7]);
137 mov r3, r3, asr #16 @ R3=ROWr32[2]>>16=ROWr16[5]
138 teq r3, #0 @ if null avoid muls
140 mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
141 mov r4, r4, asr #16 @ R4=ROWr32[3]>>16=ROWr16[7] (flags untouched, the NE test on R3 still holds)
143 mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
144 mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
145 rsbne r3, r3, #0 @ R3=-ROWr16[5]
146 mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
148 teq r4, #0 @ if null avoid muls
150 mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
151 mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
152 rsbne r4, r4, #0 @ R4=-ROWr16[7]
153 mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
155 mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
158 @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
159 @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
160 @@ R12=__const_ptr_, R14=&block[n]
@@ Row pass, a0-a3: contributions of the even coefficients row[0], row[2], row[4], row[6].
@@ NOTE(review): the muls below use R9 as W4 and R8 as W2; the instructions loading them are not among the lines shown here - TODO confirm.
163 @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
164 @@ a1 = a0 + W6 * row[2];
165 @@ a2 = a0 - W6 * row[2];
166 @@ a3 = a0 - W2 * row[2];
167 @@ a0 = a0 + W2 * row[2];
169 mul r6, r9, r6 @ R6=W4*ROWr16[0]
170 ldr r10, =W6 @ R10=W6
171 ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet)
172 add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0, rounding term included)
174 mul r11, r10, r4 @ R11=W6*ROWr16[2]
176 sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
177 @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
180 beq __end_bef_a_evaluation @ intended: taken when temp is null, i.e. row[4]..row[7] are all null (TODO confirm which instruction sets the flags)
182 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
183 mul r11, r8, r4 @ R11=W2*ROWr16[2]
184 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3); ROWr16[2] is overwritten here
185 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
188 @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
189 @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
190 @@ R12=__const_ptr_, R14=&block[n]
197 ldrsh r11, [r14, #8] @ R11=ROWr16[4]
198 teq r11, #0 @ if null avoid muls
200 mulne r11, r9, r11 @ R11=W4*ROWr16[4]
202 ldrsh r9, [r14, #12] @ R9=ROWr16[6] (W4 is not needed any more; the load leaves the flags untouched)
204 addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
205 subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
206 subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
207 addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
208 @@ W6 alone is no longer needed afterwards, so R10 is reused to hold W2*ROWr16[6]
209 teq r9, #0 @ if null avoid muls
211 mulne r11, r10, r9 @ R11=W6*ROWr16[6]
212 addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
213 mulne r10, r8, r9 @ R10=W2*ROWr16[6]
218 subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
220 subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
221 addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
224 @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
225 @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
226 @@ R12=__const_ptr_, R14=&block[n]
@@ Row pass, results: combine a0-a3 with b0-b3 and build pairs of 16-bit results,
@@ the even-indexed one in the low half-word (R8) and the odd-indexed one in the high half-word (R9).
227 @@ row[0] = (a0 + b0) >> ROW_SHIFT;
228 @@ row[1] = (a1 + b1) >> ROW_SHIFT;
229 @@ row[2] = (a2 + b2) >> ROW_SHIFT;
230 @@ row[3] = (a3 + b3) >> ROW_SHIFT;
231 @@ row[4] = (a3 - b3) >> ROW_SHIFT;
232 @@ row[5] = (a2 - b2) >> ROW_SHIFT;
233 @@ row[6] = (a1 - b1) >> ROW_SHIFT;
234 @@ row[7] = (a0 - b0) >> ROW_SHIFT;
235 add r8, r6, r0 @ R8=a0+b0
236 add r9, r2, r1 @ R9=a1+b1
237 @@ put two 16-bit half-words in a 32-bit word
238 @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only little-endian compliant then!!!)
@@ (x<<5) & 0xFFFF0000 leaves the low 16 bits of x>>11 in the upper half-word, so no separate >>11 then <<16 is needed
239 ldr r10, =MASK_MSHW @ R10=0xFFFF0000
240 and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
241 mvn r11, r10 @ R11= NOT R10= 0x0000FFFF
242 and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
246 add r8, r3, r5 @ R8=a2+b2
247 add r9, r4, r7 @ R9=a3+b3
248 and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
249 and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
253 sub r8, r4, r7 @ R8=a3-b3
254 sub r9, r3, r5 @ R9=a2-b2
255 and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
256 and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
260 sub r8, r2, r1 @ R8=a1-b1
261 sub r9, r6, r0 @ R9=a0-b0
262 and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
263 and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
270 @@ Special case: the row is empty except for ROWr16[0].
@@ Every one of the 8 outputs of the row is then set to (ROWr16[0]<<3) truncated to 16 bits,
@@ written as four identical 32-bit words.
271 @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
272 @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
273 @@ R8=0xFFFF (temp), R9-R11 free
274 mov r8, #0x10000 @ R8=0x10000, first of the 2 steps needed to build 0xFFFF; doing it this way saves a ldr (because of delay run).
275 sub r8, r8, #1 @ R8=0xFFFF, R8 is now ready.
276 and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
277 orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16): same value in both half-words
278 str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5
279 str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5
280 str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5
281 str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5
@@ End of the row loop: reload block from the stack and compare it with the current row pointer.
284 @@ at this point, R0-R11 (free)
285 @@ R12=__const_ptr_, R14=&block[n]
286 ldr r0, [sp, #0] @ R0=block (reloaded from the slot written in the prologue)
287 teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished.
@@ Column pass setup.
293 @@ at this point, R0=block, R1-R11 (free)
294 @@ R12=__const_ptr_, R14=&block[n]
295 add r14, r0, #14 @ R14=&block[7] (byte offset 14), better start from the last col, and decrease the value until col=0, i.e. R14=block.
299 @@ at this point, R0=block (temp), R1-R11 (free)
300 @@ R12=__const_ptr_, R14=&block[n]
301 @@ Column pass: proceed with b0-b3 first, followed by a0-a3
@@ Consecutive elements of a column are 8 int16_t (16 bytes) apart, hence the offsets 80 and 112 used below.
@@ NOTE(review): R7 and R2 are expected to hold col[8x1] and col[8x3], R8 and R9 to hold W1 and W3;
@@ the instructions loading them are not among the lines shown here - TODO confirm.
302 @@ MUL16(b0, W1, col[8x1]);
303 @@ MUL16(b1, W3, col[8x1]);
304 @@ MUL16(b2, W5, col[8x1]);
305 @@ MUL16(b3, W7, col[8x1]);
306 @@ MAC16(b0, W3, col[8x3]);
307 @@ MAC16(b1, -W7, col[8x3]);
308 @@ MAC16(b2, -W1, col[8x3]);
309 @@ MAC16(b3, -W5, col[8x3]);
312 mul r0, r8, r7 @ R0=W1*COLr16[1x8]=b0 (COLr16[1x8] must be the second arg, to have the possibility to save 1 cycle)
314 ldr r10, =W5 @ R10=W5
315 mul r1, r9, r7 @ R1=W3*COLr16[1x8]=b1 (COLr16[1x8] must be the second arg, to have the possibility to save 1 cycle)
316 ldr r11, =W7 @ R11=W7
317 mul r5, r10, r7 @ R5=W5*COLr16[1x8]=b2 (COLr16[1x8] must be the second arg, to have the possibility to save 1 cycle)
319 mul r7, r11, r7 @ R7=W7*COLr16[1x8]=b3 (COLr16[1x8] must be the second arg, to have the possibility to save 1 cycle)
320 teq r2, #0 @ if 0, then avoid muls
@@ the mlane/rsbne below only execute when the teq above found a non-zero value (NE condition)
322 mlane r0, r9, r2, r0 @ R0+=W3*COLr16[3x8]=b0 (COLr16[3x8] must be the second arg, to have the possibility to save 1 cycle)
323 rsbne r2, r2, #0 @ R2=-COLr16[3x8]
324 mlane r1, r11, r2, r1 @ R1-=W7*COLr16[3x8]=b1 (COLr16[3x8] must be the second arg, to have the possibility to save 1 cycle)
325 mlane r5, r8, r2, r5 @ R5-=W1*COLr16[3x8]=b2 (COLr16[3x8] must be the second arg, to have the possibility to save 1 cycle)
327 mlane r7, r10, r2, r7 @ R7-=W5*COLr16[3x8]=b3 (COLr16[3x8] must be the second arg, to have the possibility to save 1 cycle)
329 @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
330 @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
331 @@ R12=__const_ptr_, R14=&block[n]
332 @@ MAC16(b0, W5, col[5x8]);
333 @@ MAC16(b2, W7, col[5x8]);
334 @@ MAC16(b3, W3, col[5x8]);
335 @@ MAC16(b1, -W1, col[5x8]);
336 @@ MAC16(b0, W7, col[7x8]);
337 @@ MAC16(b2, W3, col[7x8]);
338 @@ MAC16(b3, -W1, col[7x8]);
339 @@ MAC16(b1, -W5, col[7x8]);
340 ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
341 teq r3, #0 @ if 0 then avoid muls
343 mlane r0, r10, r3, r0 @ R0+=W5*COLr16[5x8]=b0
344 mlane r5, r11, r3, r5 @ R5+=W7*COLr16[5x8]=b2
345 mlane r7, r9, r3, r7 @ R7+=W3*COLr16[5x8]=b3
346 rsbne r3, r3, #0 @ R3=-COLr16[5x8]
347 ldrsh r4, [r14, #112] @ R4=COLr16[7x8] (the load leaves the flags untouched, the NE test on R3 still holds)
349 mlane r1, r8, r3, r1 @ R1-=W1*COLr16[5x8]=b1
351 teq r4, #0 @ if 0 then avoid muls
353 mlane r0, r11, r4, r0 @ R0+=W7*COLr16[7x8]=b0
354 mlane r5, r9, r4, r5 @ R5+=W3*COLr16[7x8]=b2
355 rsbne r4, r4, #0 @ R4=-COLr16[7x8]
356 mlane r7, r8, r4, r7 @ R7-=W1*COLr16[7x8]=b3
358 mlane r1, r10, r4, r1 @ R1-=W5*COLr16[7x8]=b1
360 @@ __end_b_evaluation2:
361 @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
362 @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
363 @@ R12=__const_ptr_, R14=&block[n]
@@ Column pass, a0-a3: contributions of col[8x0], col[8x2], col[8x4] and col[8x6].
@@ NOTE(review): the muls below use R6 as col[8x0], R9 as W4 and R8 as W2; the instructions loading them
@@ are not among the lines shown here - TODO confirm.
366 @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
367 @@ a1 = a0 + W6 * col[8x2];
368 @@ a2 = a0 - W6 * col[8x2];
369 @@ a3 = a0 - W2 * col[8x2];
370 @@ a0 = a0 + W2 * col[8x2];
373 mul r6, r9, r6 @ R6=W4*COLr16[0x8]
374 ldr r10, =W6 @ R10=W6
375 ldrsh r4, [r14, #32] @ R4=COLr16[2x8] (a3 not defined yet)
376 add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[0x8] + 1<<(COL_SHIFT-1) (a0, rounding term included)
377 mul r11, r10, r4 @ R11=W6*COLr16[2x8]
379 add r2, r6, r11 @ R2=a0+W6*COLr16[2x8] (a1)
380 sub r3, r6, r11 @ R3=a0-W6*COLr16[2x8] (a2)
381 mul r11, r8, r4 @ R11=W2*COLr16[2x8]
382 sub r4, r6, r11 @ R4=a0-W2*COLr16[2x8] (a3)
383 add r6, r6, r11 @ R6=a0+W2*COLr16[2x8] (a0)
385 @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
386 @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
387 @@ R12=__const_ptr_, R14=&block[n]
392 ldrsh r11, [r14, #64] @ R11=COLr16[4x8]
393 teq r11, #0 @ if null avoid muls
395 mulne r11, r9, r11 @ R11=W4*COLr16[4x8]
397 addne r6, r6, r11 @ R6+=W4*COLr16[4x8] (a0)
398 subne r2, r2, r11 @ R2-=W4*COLr16[4x8] (a1)
399 subne r3, r3, r11 @ R3-=W4*COLr16[4x8] (a2)
400 ldrsh r9, [r14, #96] @ R9=COLr16[6x8] (the load leaves the flags untouched)
402 addne r4, r4, r11 @ R4+=W4*COLr16[4x8] (a3)
403 @@ W6 alone is no longer needed afterwards, so R10 is reused to hold W2*COLr16[6x8]
404 teq r9, #0 @ if null avoid muls
406 mulne r11, r10, r9 @ R11=W6*COLr16[6x8]
407 addne r6, r6, r11 @ R6+=W6*COLr16[6x8] (a0)
408 mulne r10, r8, r9 @ R10=W2*COLr16[6x8]
413 subne r4, r4, r11 @ R4-=W6*COLr16[6x8] (a3)
415 subne r2, r2, r10 @ R2-=W2*COLr16[6x8] (a1)
416 addne r3, r3, r10 @ R3+=W2*COLr16[6x8] (a2)
417 @@ __end_a_evaluation2:
418 @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
419 @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
420 @@ R12=__const_ptr_, R14=&block[n]
@@ Column pass, results: each pair below is computed in R8/R9 and scaled down with an arithmetic shift by COL_SHIFT.
421 @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
422 @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
423 @@ col[16] = ((a2 + b2) >> COL_SHIFT);
424 @@ col[24] = ((a3 + b3) >> COL_SHIFT);
425 @@ col[32] = ((a3 - b3) >> COL_SHIFT);
426 @@ col[40] = ((a2 - b2) >> COL_SHIFT);
427 @@ col[48] = ((a1 - b1) >> COL_SHIFT);
428 @@ col[56] = ((a0 - b0) >> COL_SHIFT);
429 @@@@@ no optimization here @@@@@
430 add r8, r6, r0 @ R8=a0+b0
431 add r9, r2, r1 @ R9=a1+b1
432 mov r8, r8, asr #COL_SHIFT @ R8=(a0+b0)>>COL_SHIFT
433 mov r9, r9, asr #COL_SHIFT @ R9=(a1+b1)>>COL_SHIFT
436 add r8, r3, r5 @ R8=a2+b2
437 add r9, r4, r7 @ R9=a3+b3
438 mov r8, r8, asr #COL_SHIFT @ R8=(a2+b2)>>COL_SHIFT
439 mov r9, r9, asr #COL_SHIFT @ R9=(a3+b3)>>COL_SHIFT
442 sub r8, r4, r7 @ R8=a3-b3
443 sub r9, r3, r5 @ R9=a2-b2
444 mov r8, r8, asr #COL_SHIFT @ R8=(a3-b3)>>COL_SHIFT
445 mov r9, r9, asr #COL_SHIFT @ R9=(a2-b2)>>COL_SHIFT
448 sub r8, r2, r1 @ R8=a1-b1
449 sub r9, r6, r0 @ R9=a0-b0
450 mov r8, r8, asr #COL_SHIFT @ R8=(a1-b1)>>COL_SHIFT
451 mov r9, r9, asr #COL_SHIFT @ R9=(a0-b0)>>COL_SHIFT
@@ End of the column loop: reload block from the stack and compare it with the current column pointer.
456 @@ at this point, R0-R11 (free)
457 @@ R12=__const_ptr_, R14=&block[n]
458 ldr r0, [sp, #0] @ R0=block (reloaded from the slot written in the prologue)
459 teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished.
466 @@ __end_simple_idct_arm:
467 @@ Epilogue: restore the registers to their previous state and return.
468 add sp, sp, #8 @@ release the 8 bytes reserved for the local variables
469 ldmfd sp!, {r4-r11, r15} @@ pop R4-R11 and load PC (R15) with the LR value pushed on entry, which returns to the caller.
473 @@ Kind of sub-function, placed out of line so as not to burden the common case.
@@ It is the target of the beq in the row-pass a0-a3 evaluation and finishes a1, a3 and a0
@@ with the same four instructions as the fall-through path.
@@ Expects R6=a0, R11=W6*ROWr16[2], R8=W2, R4=ROWr16[2].
@@ NOTE(review): presumably the branch back skips the ROWr16[4]/ROWr16[6] terms; the
@@ __end_a_evaluation label is not among the lines shown here - TODO confirm.
474 __end_bef_a_evaluation:
475 add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
476 mul r11, r8, r4 @ R11=W2*ROWr16[2]
477 sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
478 add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
479 bal __end_a_evaluation @ unconditional branch back into the main row path