git.sesse.net Git - ffmpeg/blob - libavcodec/arm/simple_idct_arm.S

   1 /*
   2  * Copyright (C) 2002 Frederic 'dilb' Boulay
   3  *
   4  * Author: Frederic Boulay <dilb@handhelds.org>
   5  *
   6  * The function defined in this file is derived from the simple_idct function
   7  * from the libavcodec library part of the FFmpeg project.
   8  *
   9  * This file is part of FFmpeg.
  10  *
  11  * FFmpeg is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License as published by the Free Software Foundation; either
  14  * version 2.1 of the License, or (at your option) any later version.
  15  *
  16  * FFmpeg is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with FFmpeg; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24  */
  25
  26 #include "libavutil/arm/asm.S"
  27
  28 /* useful constants for the algorithm, they are saved in __constant_ptr__ at */
  29 /* the end of the source code.*/
  30 #define W1  22725
  31 #define W2  21407
  32 #define W3  19266
  33 #define W4  16383
  34 #define W5  12873
  35 #define W6  8867
  36 #define W7  4520
  37 #define MASK_MSHW 0xFFFF0000 /* keeps only the most significant half-word of a 32-bit word */
  38
  39 /* byte offsets of the constants inside the __constant_ptr__ table (one 32-bit word each) */
  40 #define offW1  0
  41 #define offW2  4
  42 #define offW3  8
  43 #define offW4  12
  44 #define offW5  16
  45 #define offW6  20
  46 #define offW7  24
  47 #define offMASK_MSHW 28
  48
  49 #define ROW_SHIFT 11
  50 #define ROW_SHIFT2MSHW (16-11) /* 16-ROW_SHIFT: left shift that puts (x >> ROW_SHIFT) into the most significant half-word */
  51 #define COL_SHIFT 20
  52 #define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1), rounding term added to a0 in the row pass */
  53 #define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1), rounding term added to a0 in the column pass */
  54
  55
  56 function ff_simple_idct_arm, export=1
  57         @@ void ff_simple_idct_arm(int16_t *block)
  58         @@ in-place transform of the 8x8 block of 16-bit coefficients: a pass over the 8 rows, then a pass over the 8 columns.
  59         @@ R0-R3 are scratch regs, so there is no need to save them, but R0 holds the pointer to block,
  60         @@ so it is spilled to the stack below before being reused as a temporary.
  61         @@ R12 is another scratch register, so it does not need to be saved either
  62         @@ save R4-R11 and LR, all of them are used below
  63         stmfd sp!, {r4-r11, r14} @ R14 is also called LR
  64         @@ at this point, R0=block, other registers are free.
  65         add r14, r0, #112        @ R14=&block[8*7], start from the last row and step down until row 0 is reached, i.e. R14=block.
  66         adr r12, __constant_ptr__ @ R12=__constant_ptr__, the table containing the constants (the original author notes a dedicated register is probably not necessary)
  67         @@ reserve 2 words on the stack, only sp[0] is used (it keeps block)
  68         sub sp, sp, #8          @ allow 2 local variables
  69         str r0, [sp, #0]        @ save block in sp[0]
  70         @@ stack status
  71         @@ sp+4   free
  72         @@ sp+0   R0  (block)
  73
  74
  75         @@ at this point, R0=block, R14=&block[56], R12=__constant_ptr__, R1-R11 free
  76
  77
  78 __row_loop:
  79         @@ load the current row as four 32-bit words (two coefficients each) and classify it as all zero, zero except ROWr16[0], or general. According to the original author, the StrongARM specs make it unnecessary to split the 32-bit loads into 16-bit accesses, and word loads leave more registers usable :)
  80         ldr r1, [r14, #0]        @ R1=((int32 *)R14)[0]=ROWr32[0] (current row cast to a 32b pointer)
  81         ldr r2, [r14, #4]        @ R2=((int32 *)R14)[1]=ROWr32[1]
  82         ldr r3, [r14, #8]        @ R3=ROWr32[2]
  83         ldr r4, [r14, #12]       @ R4=ROWr32[3]
  84         @@ check if the words are null, if all of them are null, then proceed with the next row (branch __end_row_loop),
  85         @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
  86         @@ else follow the complete algorithm.
  87         @@ at this point, R0=block, R14=&block[n], R12=__constant_ptr__, R1=ROWr32[0], R2=ROWr32[1],
  88         @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
  89         orr r5, r4, r3           @ R5=R4 | R3
  90         orr r5, r5, r2           @ R5=R4 | R3 | R2
  91         orrs r6, r5, r1          @ Test R5 | R1 (zero only if the whole row is null)
  92         beq __end_row_loop
  93         mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
  94         ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
  95         orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7 (zero only if ROWr16[1..7] are all null)
  96         beq __almost_empty_row
  97
  98 __b_evaluation:
  99         @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
 100         @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
 101         @@     R12=__constant_ptr__, R14=&block[n]
 102         @@ to save some registers/calls, proceed with b0-b3 (terms of the odd coefficients) first, followed by a0-a3
 103
 104         @@ MUL16(b0, W1, row[1]);
 105         @@ MUL16(b1, W3, row[1]);
 106         @@ MUL16(b2, W5, row[1]);
 107         @@ MUL16(b3, W7, row[1]);
 108         @@ MAC16(b0, W3, row[3]);
 109         @@ MAC16(b1, -W7, row[3]);
 110         @@ MAC16(b2, -W1, row[3]);
 111         @@ MAC16(b3, -W5, row[3]);
 112         ldr r8, [r12, #offW1]    @ R8=W1
 113         mov r2, r2, asr #16      @ R2=ROWr16[3]
 114         mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 115         ldr r9, [r12, #offW3]    @ R9=W3
 116         ldr r10, [r12, #offW5]   @ R10=W5
 117         mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 118         ldr r11, [r12, #offW7]   @ R11=W7
 119         mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 120         mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 121         teq r2, #0               @ if null avoid muls
 122         itttt ne                 @ an IT block covers at most 4 instructions, hence the separate it for the fifth one below
 123         mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 124         rsbne r2, r2, #0         @ R2=-ROWr16[3], so that the following mla instructions subtract
 125         mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 126         mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 127         it    ne
 128         mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 129
 130         @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 131         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 132         @@     R12=__constant_ptr__, R14=&block[n]
 133         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 134         @@ if (temp != 0) {}
 135         orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3] (kept in R2, it is tested again in the a evaluation)
 136         beq __end_b_evaluation
 137
 138         @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3=ROWr32[2], R4=ROWr32[3],
 139         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 140         @@     R12=__constant_ptr__, R14=&block[n]
 141         @@ MAC16(b0, W5, row[5]);
 142         @@ MAC16(b2, W7, row[5]);
 143         @@ MAC16(b3, W3, row[5]);
 144         @@ MAC16(b1, -W1, row[5]);
 145         @@ MAC16(b0, W7, row[7]);
 146         @@ MAC16(b2, W3, row[7]);
 147         @@ MAC16(b3, -W1, row[7]);
 148         @@ MAC16(b1, -W5, row[7]);
 149         mov r3, r3, asr #16      @ R3=ROWr16[5]
 150         teq r3, #0               @ if null avoid muls
 151         it    ne
 152         mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
 153         mov r4, r4, asr #16      @ R4=ROWr16[7] (mov without the s suffix keeps the flags of the teq above)
 154         itttt ne
 155         mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
 156         mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
 157         rsbne r3, r3, #0         @ R3=-ROWr16[5]
 158         mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
 159         @@ R3 is free now
 160         teq r4, #0               @ if null avoid muls
 161         itttt ne
 162         mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
 163         mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
 164         rsbne r4, r4, #0         @ R4=-ROWr16[7]
 165         mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
 166         it    ne
 167         mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
 168         @@ R4 is free now
 169 __end_b_evaluation:
 170         @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
 171         @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 172         @@     R12=__constant_ptr__, R14=&block[n]
 173
 174 __a_evaluation:
 175         @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
 176         @@ a1 = a0 + W6 * row[2];
 177         @@ a2 = a0 - W6 * row[2];
 178         @@ a3 = a0 - W2 * row[2];
 179         @@ a0 = a0 + W2 * row[2];
 180         ldr r9, [r12, #offW4]    @ R9=W4
 181         mul r6, r9, r6           @ R6=W4*ROWr16[0]
 182         ldr r10, [r12, #offW6]   @ R10=W6
 183         ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
 184         add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
 185
 186         mul r11, r10, r4         @ R11=W6*ROWr16[2]
 187         ldr r8, [r12, #offW2]    @ R8=W2
 188         sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
 189         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 190         @@ if (temp != 0) {}
 191         teq r2, #0               @ R2 still holds ROWr32[2] | ROWr32[3] from the b evaluation
 192         beq __end_bef_a_evaluation @ row[4..7] are all null: only the row[0] and row[2] terms remain
 193
 194         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 195         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 196         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 197         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 198
 199
 200         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 201         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 202         @@     R12=__constant_ptr__, R14=&block[n]
 203
 204
 205         @@ a0 += W4*row[4]
 206         @@ a1 -= W4*row[4]
 207         @@ a2 -= W4*row[4]
 208         @@ a3 += W4*row[4]
 209         ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
 210         teq r11, #0              @ if null avoid muls
 211         it    ne
 212         mulne r11, r9, r11       @ R11=W4*ROWr16[4]
 213         @@ R9 is free now
 214         ldrsh r9, [r14, #12]     @ R9=ROWr16[6] (a load keeps the flags of the teq above, they still guard the next 4 instructions)
 215         itttt ne
 216         addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
 217         subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
 218         subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
 219         addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
 220         @@ W6 alone is no longer needed once W6*ROWr16[6] is computed, so R10 is reused for W2*ROWr16[6]
 221         teq r9, #0               @ if null avoid muls
 222         itttt ne
 223         mulne r11, r10, r9       @ R11=W6*ROWr16[6]
 224         addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
 225         mulne r10, r8, r9        @ R10=W2*ROWr16[6]
 226         @@ a0 += W6*row[6];
 227         @@ a3 -= W6*row[6];
 228         @@ a1 -= W2*row[6];
 229         @@ a2 += W2*row[6];
 230         subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
 231         itt   ne
 232         subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
 233         addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
 234
 235 __end_a_evaluation:
 236         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 237         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 238         @@     R12=__constant_ptr__, R14=&block[n]
 239         @@ row[0] = (a0 + b0) >> ROW_SHIFT;
 240         @@ row[1] = (a1 + b1) >> ROW_SHIFT;
 241         @@ row[2] = (a2 + b2) >> ROW_SHIFT;
 242         @@ row[3] = (a3 + b3) >> ROW_SHIFT;
 243         @@ row[4] = (a3 - b3) >> ROW_SHIFT;
 244         @@ row[5] = (a2 - b2) >> ROW_SHIFT;
 245         @@ row[6] = (a1 - b1) >> ROW_SHIFT;
 246         @@ row[7] = (a0 - b0) >> ROW_SHIFT;
 247         add r8, r6, r0           @ R8=a0+b0
 248         add r9, r2, r1           @ R9=a1+b1
 249         @@ pack two 16-bit results into one 32-bit word, so that each pair is written with a single str
 250         @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (this layout is only valid in little endian!!!)
 251         ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
 252         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5), i.e. the low 16 bits of (a1+b1)>>11 moved to the upper half-word
 253         mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
 254         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
 255         orr r8, r8, r9           @ R8=row[0] | (row[1]<<16)
 256         str r8, [r14, #0]
 257
 258         add r8, r3, r5           @ R8=a2+b2
 259         add r9, r4, r7           @ R9=a3+b3
 260         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
 261         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
 262         orr r8, r8, r9           @ R8=row[2] | (row[3]<<16)
 263         str r8, [r14, #4]
 264
 265         sub r8, r4, r7           @ R8=a3-b3
 266         sub r9, r3, r5           @ R9=a2-b2
 267         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
 268         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
 269         orr r8, r8, r9           @ R8=row[4] | (row[5]<<16)
 270         str r8, [r14, #8]
 271
 272         sub r8, r2, r1           @ R8=a1-b1
 273         sub r9, r6, r0           @ R9=a0-b0
 274         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
 275         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
 276         orr r8, r8, r9           @ R8=row[6] | (row[7]<<16)
 277         str r8, [r14, #12]
 278
 279         bal __end_row_loop       @ skip the almost-empty-row special case
 280
 281 __almost_empty_row:
 282         @@ the row is null except ROWr16[0]: all 8 outputs of the row are set to the low 16 bits of ROWr16[0]<<3
 283         @@ at this point, R0=block, R14=&block[n], R12=__constant_ptr__, R1=ROWr32[0], R2=ROWr32[1],
 284         @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
 285         @@                R8 (free, it receives the 0xFFFF mask below), R9-R11 free
 286         mov r8, #0x10000         @ R8=0x10000, first of the 2 steps needed to build 0xFFFF; the original author notes it saves a ldr (because of the load delay).
 287         sub r8, r8, #1           @ R8=0xFFFF, R8 is now ready.
 288         and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
 289         orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16), the same 16-bit value in both half-words
 290         str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
 291         str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
 292         str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
 293         str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
 294
 295 __end_row_loop:
 296         @@ at this point, R0-R11 (free)
 297         @@     R12=__constant_ptr__, R14=&block[n]
 298         ldr r0, [sp, #0]         @ R0=block (reloaded, R0 was used as a temporary for b0)
 299         teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
 300         sub r14, r14, #16        @ step to the previous row (8 coefficients of 2 bytes); sub without the s suffix keeps the flags of the teq above
 301         bne __row_loop
 302
 303
 304
 305         @@ at this point, R0=block, R1-R11 (free)
 306         @@     R12=__constant_ptr__, R14=block-16 (about to be replaced)
 307         add r14, r0, #14        @ R14=&block[7], start from the last column and step down until column 0 is reached, i.e. R14=block.
 308 __col_loop:
 309
 310 __b_evaluation2:
 311         @@ at this point, R0=block (temp),  R1-R11 (free)
 312         @@     R12=__constant_ptr__, R14=&block[n]
 313         @@ column pass: the elements of a column are 16 bytes apart; proceed with b0-b3 first, followed by a0-a3
 314         @@ MUL16(b0, W1, col[8x1]);
 315         @@ MUL16(b1, W3, col[8x1]);
 316         @@ MUL16(b2, W5, col[8x1]);
 317         @@ MUL16(b3, W7, col[8x1]);
 318         @@ MAC16(b0, W3, col[8x3]);
 319         @@ MAC16(b1, -W7, col[8x3]);
 320         @@ MAC16(b2, -W1, col[8x3]);
 321         @@ MAC16(b3, -W5, col[8x3]);
 322         ldr r8, [r12, #offW1]    @ R8=W1
 323         ldrsh r7, [r14, #16]     @ R7=COLr16[8x1]
 324         mul r0, r8, r7           @ R0=W1*COLr16[8x1]=b0 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 325         ldr r9, [r12, #offW3]    @ R9=W3
 326         ldr r10, [r12, #offW5]   @ R10=W5
 327         mul r1, r9, r7           @ R1=W3*COLr16[8x1]=b1 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 328         ldr r11, [r12, #offW7]   @ R11=W7
 329         mul r5, r10, r7          @ R5=W5*COLr16[8x1]=b2 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 330         ldrsh r2, [r14, #48]     @ R2=COLr16[8x3]
 331         mul r7, r11, r7          @ R7=W7*COLr16[8x1]=b3 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 332         teq r2, #0               @ if 0, then avoid muls
 333         itttt ne
 334         mlane r0, r9, r2, r0     @ R0+=W3*COLr16[8x3]=b0 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 335         rsbne r2, r2, #0         @ R2=-COLr16[8x3]
 336         mlane r1, r11, r2, r1    @ R1-=W7*COLr16[8x3]=b1 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 337         mlane r5, r8, r2, r5     @ R5-=W1*COLr16[8x3]=b2 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 338         it    ne
 339         mlane r7, r10, r2, r7    @ R7-=W5*COLr16[8x3]=b3 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 340
 341         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 342         @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 343         @@     R12=__constant_ptr__, R14=&block[n]
 344         @@ MAC16(b0, W5, col[5x8]);
 345         @@ MAC16(b2, W7, col[5x8]);
 346         @@ MAC16(b3, W3, col[5x8]);
 347         @@ MAC16(b1, -W1, col[5x8]);
 348         @@ MAC16(b0, W7, col[7x8]);
 349         @@ MAC16(b2, W3, col[7x8]);
 350         @@ MAC16(b3, -W1, col[7x8]);
 351         @@ MAC16(b1, -W5, col[7x8]);
 352         ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
 353         teq r3, #0               @ if 0 then avoid muls
 354         itttt ne
 355         mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
 356         mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
 357         mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
 358         rsbne r3, r3, #0         @ R3=-COLr16[5x8]
 359         ldrsh r4, [r14, #112]    @ R4=COLr16[7x8] (a load keeps the flags of the teq above)
 360         it    ne
 361         mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
 362         @@ R3 is free now
 363         teq r4, #0               @ if 0 then avoid muls
 364         itttt ne
 365         mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
 366         mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
 367         rsbne r4, r4, #0         @ R4=-COLr16[7x8]
 368         mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
 369         it    ne
 370         mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
 371         @@ R4 is free now
 372 __end_b_evaluation2:
 373         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 374         @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 375         @@     R12=__constant_ptr__, R14=&block[n]
 376
 377 __a_evaluation2:
 378         @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
 379         @@ a1 = a0 + W6 * col[8x2];
 380         @@ a2 = a0 - W6 * col[8x2];
 381         @@ a3 = a0 - W2 * col[8x2];
 382         @@ a0 = a0 + W2 * col[8x2];
 383         ldrsh r6, [r14, #0]      @ R6=COLr16[8x0]
 384         ldr r9, [r12, #offW4]    @ R9=W4
 385         mul r6, r9, r6           @ R6=W4*COLr16[8x0]
 386         ldr r10, [r12, #offW6]   @ R10=W6
 387         ldrsh r4, [r14, #32]     @ R4=COLr16[8x2] (a3 not defined yet)
 388         add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + 1<<(COL_SHIFT-1) (a0)
 389         mul r11, r10, r4         @ R11=W6*COLr16[8x2]
 390         ldr r8, [r12, #offW2]    @ R8=W2
 391         add r2, r6, r11          @ R2=a0+W6*COLr16[8x2] (a1)
 392         sub r3, r6, r11          @ R3=a0-W6*COLr16[8x2] (a2)
 393         mul r11, r8, r4          @ R11=W2*COLr16[8x2]
 394         sub r4, r6, r11          @ R4=a0-W2*COLr16[8x2] (a3)
 395         add r6, r6, r11          @ R6=a0+W2*COLr16[8x2] (a0)
 396
 397         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 398         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 399         @@     R12=__constant_ptr__, R14=&block[n]
 400         @@ a0 += W4*col[8x4]
 401         @@ a1 -= W4*col[8x4]
 402         @@ a2 -= W4*col[8x4]
 403         @@ a3 += W4*col[8x4]
 404         ldrsh r11, [r14, #64]    @ R11=COLr16[8x4]
 405         teq r11, #0              @ if null avoid muls
 406         itttt ne
 407         mulne r11, r9, r11       @ R11=W4*COLr16[8x4]
 408         @@ R9 is free now
 409         addne r6, r6, r11        @ R6+=W4*COLr16[8x4] (a0)
 410         subne r2, r2, r11        @ R2-=W4*COLr16[8x4] (a1)
 411         subne r3, r3, r11        @ R3-=W4*COLr16[8x4] (a2)
 412         ldrsh r9, [r14, #96]     @ R9=COLr16[8x6] (a load keeps the flags of the teq above)
 413         it    ne
 414         addne r4, r4, r11        @ R4+=W4*COLr16[8x4] (a3)
 415         @@ W6 alone is no longer needed once W6*COLr16[8x6] is computed, so R10 is reused for W2*COLr16[8x6]
 416         teq r9, #0               @ if null avoid muls
 417         itttt ne
 418         mulne r11, r10, r9       @ R11=W6*COLr16[8x6]
 419         addne r6, r6, r11        @ R6+=W6*COLr16[8x6] (a0)
 420         mulne r10, r8, r9        @ R10=W2*COLr16[8x6]
 421         @@ a0 += W6*col[8x6];
 422         @@ a3 -= W6*col[8x6];
 423         @@ a1 -= W2*col[8x6];
 424         @@ a2 += W2*col[8x6];
 425         subne r4, r4, r11        @ R4-=W6*COLr16[8x6] (a3)
 426         itt   ne
 427         subne r2, r2, r10        @ R2-=W2*COLr16[8x6] (a1)
 428         addne r3, r3, r10        @ R3+=W2*COLr16[8x6] (a2)
 429 __end_a_evaluation2:
 430         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 431         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 432         @@     R12=__constant_ptr__, R14=&block[n]
 433         @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
 434         @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
 435         @@ col[16] = ((a2 + b2) >> COL_SHIFT);
 436         @@ col[24] = ((a3 + b3) >> COL_SHIFT);
 437         @@ col[32] = ((a3 - b3) >> COL_SHIFT);
 438         @@ col[40] = ((a2 - b2) >> COL_SHIFT);
 439         @@ col[48] = ((a1 - b1) >> COL_SHIFT);
 440         @@ col[56] = ((a0 - b0) >> COL_SHIFT);
 441         @@@@@ no optimization here: each result is shifted and stored separately with strh @@@@@
 442         add r8, r6, r0           @ R8=a0+b0
 443         add r9, r2, r1           @ R9=a1+b1
 444         mov r8, r8, asr #COL_SHIFT
 445         mov r9, r9, asr #COL_SHIFT
 446         strh r8, [r14, #0]       @ col[0]
 447         strh r9, [r14, #16]      @ col[8]
 448         add r8, r3, r5           @ R8=a2+b2
 449         add r9, r4, r7           @ R9=a3+b3
 450         mov r8, r8, asr #COL_SHIFT
 451         mov r9, r9, asr #COL_SHIFT
 452         strh r8, [r14, #32]      @ col[16]
 453         strh r9, [r14, #48]      @ col[24]
 454         sub r8, r4, r7           @ R8=a3-b3
 455         sub r9, r3, r5           @ R9=a2-b2
 456         mov r8, r8, asr #COL_SHIFT
 457         mov r9, r9, asr #COL_SHIFT
 458         strh r8, [r14, #64]      @ col[32]
 459         strh r9, [r14, #80]      @ col[40]
 460         sub r8, r2, r1           @ R8=a1-b1
 461         sub r9, r6, r0           @ R9=a0-b0
 462         mov r8, r8, asr #COL_SHIFT
 463         mov r9, r9, asr #COL_SHIFT
 464         strh r8, [r14, #96]      @ col[48]
 465         strh r9, [r14, #112]     @ col[56]
 466
 467 __end_col_loop:
 468         @@ at this point, R0-R11 (free)
 469         @@     R12=__constant_ptr__, R14=&block[n]
 470         ldr r0, [sp, #0]         @ R0=block (reloaded, R0 was used as a temporary for b0)
 471         teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
 472         sub r14, r14, #2         @ step to the previous column (2 bytes); sub without the s suffix keeps the flags of the teq above
 473         bne __col_loop
 474
 475
 476
 477
 478 __end_simple_idct_arm:
 479         @@ restore registers to previous status!
 480         add sp, sp, #8 @@ release the 2 local stack words
 481         ldmfd sp!, {r4-r11, r15} @@ restore R4-R11 and return by loading the saved LR into PC.
 482
 483
 484
 485 @@ out-of-line tail of the row a evaluation, reached when ROWr32[2] | ROWr32[3] is null; it is placed after the return so that it stays out of the common fall-through path.
 486 __end_bef_a_evaluation:
 487         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 488         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 489         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 490         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 491         bal __end_a_evaluation   @ row[4] and row[6] are null here, so a0-a3 are final
 492
 493
 494         .align
 495 __constant_ptr__:  @@ constant table addressed through R12 with the offW*/offMASK_MSHW offsets; see the #defines at the beginning of the source code for values.
 496         .word   W1
 497         .word   W2
 498         .word   W3
 499         .word   W4
 500         .word   W5
 501         .word   W6
 502         .word   W7
 503         .word   MASK_MSHW