git.sesse.net Git - ffmpeg/blob - libavcodec/armv4l/simple_idct_arm.S

   1 /*
   2  * simple_idct_arm.S
   3  * Copyright (C) 2002 Frederic 'dilb' Boulay.
   4  * All Rights Reserved.
   5  *
   6  * Author: Frederic Boulay <dilb@handhelds.org>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  *
  24  * The function defined in this file, is derived from the simple_idct function
  25  * from the libavcodec library part of the ffmpeg project.
  26  */
  27
  28 /* Useful constants for the algorithm; they are saved in __constant_ptr__ at */
  29 /* the end of the source code. */
     /* W1..W7 are close to cos(i*PI/16)*sqrt(2)*(1<<14) for i=1..7, rounded to */
     /* an integer; note that W4 is 16383 here, i.e. one less than 1<<14. */
  30 #define W1  22725
  31 #define W2  21407
  32 #define W3  19266
  33 #define W4  16383
  34 #define W5  12873
  35 #define W6  8867
  36 #define W7  4520
     /* mask keeping only the most significant half-word of a 32-bit word */
  37 #define MASK_MSHW 0xFFFF0000
  38
  39 /* byte offsets of the constants inside the __constant_ptr__ table (one 32-bit word each, same order as the .word list) */
  40 #define offW1  0
  41 #define offW2  4
  42 #define offW3  8
  43 #define offW4  12
  44 #define offW5  16
  45 #define offW6  20
  46 #define offW7  24
  47 #define offMASK_MSHW 28
  48
     /* ROW_SHIFT / COL_SHIFT: right shifts applied to the results of the row / column pass. */
     /* ROW_SHIFT2MSHW: left shift that, combined with MASK_MSHW, places (x >> ROW_SHIFT) in the upper half-word. */
     /* ROW_SHIFTED_1 / COL_SHIFTED_1: rounding terms added to a0 before the shift. */
  49 #define ROW_SHIFT 11
  50 #define ROW_SHIFT2MSHW (16-11)
  51 #define COL_SHIFT 20
  52 #define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
  53 #define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
  54
  55
  56         .text
  57         .align
  58         .global simple_idct_ARM
  59
     @@ ----------------------------------------------------------------------
     @@ void simple_idct_ARM(int16_t *block)
     @@
     @@ In-place inverse DCT of an 8x8 block of 16-bit coefficients: a first
     @@ pass processes the 8 rows (16 bytes each, last row first), a second
     @@ pass processes the 8 columns (last column first).
     @@ In:    R0 = block
     @@ Saved: R4-R11 are pushed on entry and restored on exit; LR is pushed
     @@        and popped straight into PC to return.
     @@ Used as scratch without being saved: R0-R3, R12.
     @@ Stack: 9 registers (36 bytes) + 8 bytes of locals; only sp[0] (block)
     @@        is actually used.
     @@ Note:  the row pass reads and writes two 16-bit values per 32-bit word,
     @@        which is only valid on a little-endian target.
     @@ NOTE(review): rows are accessed with ldr/str, so block is presumably
     @@        expected to be 32-bit aligned - confirm against the callers.
     @@ ----------------------------------------------------------------------
  60 simple_idct_ARM:
  61         @@ void simple_idct_ARM(int16_t *block)
  62         @@ push every register that is needed and must be preserved (take all of them),
  63         @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
  64         @@ so it must not be overwritten before it has been saved!!
  65         @@ R12 is another scratch register, so it does not need to be saved either
  66         @@ save all registers
  67         stmfd sp!, {r4-r11, r14} @ R14 is also called LR
  68         @@ at this point, R0=block, other registers are free.
  69         add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R14=block.
  70         add r12, pc, #(__constant_ptr__-.-8) @ R12=__constant_ptr__, the vector containing the constants (PC reads as this instruction + 8, hence the -8), probably not necessary to reserve a register for it
  71         @@ reserve 2 temporary variables on the stack; only the first one (block) is used
  72         sub sp, sp, #8          @ allow 2 local variables
  73         str r0, [sp, #0]        @ save block in sp[0]
  74         @@ stack status
  75         @@ sp+4   free
  76         @@ sp+0   R0  (block)
  77
  78
  79         @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
  80
  81
  82 __row_loop:
  83         @@ Read the row as four 32-bit words and check whether it is null, almost null (only ROWr16[0] set), or not. According to strongarm specs, it is not necessary to optimise ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
  84         ldr r1, [r14, #0]        @ R1=((int32*)R14)[0]=ROWr32[0] (current row cast to a 32b pointer)
  85         ldr r2, [r14, #4]        @ R2=((int32*)R14)[1]=ROWr32[1]
  86         ldr r3, [r14, #8]        @ R3=ROWr32[2]
  87         ldr r4, [r14, #12]       @ R4=ROWr32[3]
  88         @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
  89         @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
  90         @@ else follow the complete algorithm.
  91         @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
  92         @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
  93         orr r5, r4, r3           @ R5=R4 | R3
  94         orr r5, r5, r2           @ R5=R4 | R3 | R2
  95         orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null); R6 is only a throw-away destination
  96         beq __end_row_loop       @ whole row is zero: nothing to compute, the row is left untouched
  97         mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (upper half-word on little endian; evaluate it now, as it could be useful later)
  98         ldrsh r6, [r14, #0]      @ R6=ROWr16[0] (sign extended)
  99         orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7, zero when ROWr16[1..7] are all null
 100         beq __almost_empty_row   @ only ROWr16[0] is not null
 101
 102 __b_evaluation:
 103         @@ at this point, R0=block (about to be overwritten, block is reloaded from sp[0] at __end_row_loop),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
 104         @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
 105         @@     R12=__const_ptr_, R14=&block[n]
 106         @@ to save some registers/calls, proceed with b0-b3 (odd part) first, followed by a0-a3 (even part)
 107
 108         @@ MUL16(b0, W1, row[1]);
 109         @@ MUL16(b1, W3, row[1]);
 110         @@ MUL16(b2, W5, row[1]);
 111         @@ MUL16(b3, W7, row[1]);
 112         @@ MAC16(b0, W3, row[3]);
 113         @@ MAC16(b1, -W7, row[3]);
 114         @@ MAC16(b2, -W1, row[3]);
 115         @@ MAC16(b3, -W5, row[3]);
 116         ldr r8, [r12, #offW1]    @ R8=W1
 117         mov r2, r2, asr #16      @ R2=ROWr32[1]>>16=ROWr16[3]
 118         mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 119         ldr r9, [r12, #offW3]    @ R9=W3
 120         ldr r10, [r12, #offW5]   @ R10=W5
 121         mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 122         ldr r11, [r12, #offW7]   @ R11=W7
 123         mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 124         mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle); ROWr16[1] is no longer needed
 125                 teq r2, #0               @ if null avoid muls (the NE condition guards the 5 instructions below)
 126                 mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 127         rsbne r2, r2, #0         @ R2=-ROWr16[3], so that the following mla instructions subtract
 128         mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 129         mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 130         mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 131
 132         @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 133         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 134         @@     R12=__const_ptr_, R14=&block[n]
 135         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 136         @@ if (temp != 0) {}
 137         orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3] (kept in R2: it is tested again in __a_evaluation)
 138         beq __end_b_evaluation   @ ROWr16[4..7] are all null: no contribution from row[5] and row[7]
 139
 140         @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3=ROWr32[2], R4=ROWr32[3],
 141         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 142         @@     R12=__const_ptr_, R14=&block[n]
 143         @@ MAC16(b0, W5, row[5]);
 144         @@ MAC16(b2, W7, row[5]);
 145         @@ MAC16(b3, W3, row[5]);
 146         @@ MAC16(b1, -W1, row[5]);
 147         @@ MAC16(b0, W7, row[7]);
 148         @@ MAC16(b2, W3, row[7]);
 149         @@ MAC16(b3, -W1, row[7]);
 150         @@ MAC16(b1, -W5, row[7]);
 151         mov r3, r3, asr #16      @ R3=ROWr32[2]>>16=ROWr16[5]
 152                 teq r3, #0               @ if null avoid muls
 153         mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
 154         mov r4, r4, asr #16      @ R4=ROWr32[3]>>16=ROWr16[7] (mov without S: the flags of the teq above are kept)
 155         mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
 156         mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
 157         rsbne r3, r3, #0         @ R3=-ROWr16[5]
 158         mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
 159         @@ R3 is free now
 160                 teq r4, #0               @ if null avoid muls
 161         mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
 162         mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
 163         rsbne r4, r4, #0         @ R4=-ROWr16[7]
 164         mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
 165         mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
 166         @@ R4 is free now
 167 __end_b_evaluation:
 168         @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
 169         @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 170         @@     R12=__const_ptr_, R14=&block[n]
 171
 172 __a_evaluation:
 173         @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
 174         @@ a1 = a0 + W6 * row[2];
 175         @@ a2 = a0 - W6 * row[2];
 176         @@ a3 = a0 - W2 * row[2];
 177         @@ a0 = a0 + W2 * row[2];
 178         ldr r9, [r12, #offW4]    @ R9=W4
 179         mul r6, r9, r6           @ R6=W4*ROWr16[0]
 180         ldr r10, [r12, #offW6]   @ R10=W6
 181         ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
 182         add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
 183
 184         mul r11, r10, r4         @ R11=W6*ROWr16[2]
 185         ldr r8, [r12, #offW2]    @ R8=W2
 186         sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
 187         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 188         @@ if (temp != 0) {}
 189         teq r2, #0               @ R2 still holds ROWr32[2] | ROWr32[3] from __b_evaluation
 190         beq __end_bef_a_evaluation @ ROWr16[4..7] null: a1, a3, a0 are finished out of line, then execution resumes at __end_a_evaluation
 191
 192         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 193         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 194         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 195         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 196
 197
 198         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 199         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 200         @@     R12=__const_ptr_, R14=&block[n]
 201
 202
 203         @@ a0 += W4*row[4]
 204         @@ a1 -= W4*row[4]
 205         @@ a2 -= W4*row[4]
 206         @@ a3 += W4*row[4]
 207         ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
 208                 teq r11, #0              @ if null avoid muls
 209         mulne r11, r9, r11       @ R11=W4*ROWr16[4]
 210         @@ R9 is free now
 211         ldrsh r9, [r14, #12]     @ R9=ROWr16[6] (a load does not change the flags, the NE tests below still refer to ROWr16[4])
 212         addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
 213         subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
 214         subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
 215         addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
 216         @@ W6 alone is no longer needed after the next mul, so R10 is reused to hold W2*ROWr16[6]
 217                 teq r9, #0               @ if null avoid muls
 218         mulne r11, r10, r9       @ R11=W6*ROWr16[6]
 219         addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
 220         mulne r10, r8, r9        @ R10=W2*ROWr16[6]
 221         @@ a0 += W6*row[6];
 222         @@ a3 -= W6*row[6];
 223         @@ a1 -= W2*row[6];
 224         @@ a2 += W2*row[6];
 225         subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
 226         subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
 227         addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
 228
 229 __end_a_evaluation:
 230         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 231         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 232         @@     R12=__const_ptr_, R14=&block[n]
 233         @@ row[0] = (a0 + b0) >> ROW_SHIFT;
 234         @@ row[1] = (a1 + b1) >> ROW_SHIFT;
 235         @@ row[2] = (a2 + b2) >> ROW_SHIFT;
 236         @@ row[3] = (a3 + b3) >> ROW_SHIFT;
 237         @@ row[4] = (a3 - b3) >> ROW_SHIFT;
 238         @@ row[5] = (a2 - b2) >> ROW_SHIFT;
 239         @@ row[6] = (a1 - b1) >> ROW_SHIFT;
 240         @@ row[7] = (a0 - b0) >> ROW_SHIFT;
 241         add r8, r6, r0           @ R8=a0+b0
 242         add r9, r2, r1           @ R9=a1+b1
 243         @@ put 2 16 bits half-words in a 32bits word
 244         @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
 245         ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
 246         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5), i.e. the low 16 bits of (a1+b1)>>11 placed in the upper half-word
 247         mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
 248         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
 249         orr r8, r8, r9           @ R8=row[0] | (row[1]<<16)
 250         str r8, [r14, #0]        @ ROWr32[0]=R8
 251
 252         add r8, r3, r5           @ R8=a2+b2
 253         add r9, r4, r7           @ R9=a3+b3
 254         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
 255         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
 256         orr r8, r8, r9           @ R8=row[2] | (row[3]<<16)
 257         str r8, [r14, #4]        @ ROWr32[1]=R8
 258
 259         sub r8, r4, r7           @ R8=a3-b3
 260         sub r9, r3, r5           @ R9=a2-b2
 261         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
 262         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
 263         orr r8, r8, r9           @ R8=row[4] | (row[5]<<16)
 264         str r8, [r14, #8]        @ ROWr32[2]=R8
 265
 266         sub r8, r2, r1           @ R8=a1-b1
 267         sub r9, r6, r0           @ R9=a0-b0
 268         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
 269         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
 270         orr r8, r8, r9           @ R8=row[6] | (row[7]<<16)
 271         str r8, [r14, #12]       @ ROWr32[3]=R8
 272
 273         bal __end_row_loop       @ jump over the __almost_empty_row special case
 274
 275 __almost_empty_row:
 276         @@ the row was empty, except ROWr16[0], now, management of this special case:
 277         @@ every element of the row is set to (ROWr16[0]<<3) truncated to 16 bits.
 278         @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
 279         @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8 (built below)=0xFFFF, R9-R11 free
 280         mov r8, #0x10000         @ R8=0x10000, first of the 2 steps needed to build 0xFFFF; it saves a ldr (because of its delay).
 281         sub r8, r8, #1           @ R8=0xFFFF, R8 is now ready.
 282         and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
 283         orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16), the same value in both half-words
 284         str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
 285         str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
 286         str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
 287         str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
 288
 289 __end_row_loop:
 290         @@ at this point, R0-R11 (free)
 291         @@     R12=__const_ptr_, R14=&block[n]
 292         ldr r0, [sp, #0]         @ R0=block
 293         teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
 294         sub r14, r14, #16        @ step back to the previous row (16 bytes); sub without S keeps the flags of the teq
 295         bne __row_loop           @ loop unless the row just processed was row 0
 296
 297
 298
 299         @@ at this point, R0=block, R1-R11 (free)
 300         @@     R12=__const_ptr_, R14=block-16 (overwritten just below)
 301         add r14, r0, #14        @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
 302 __col_loop:
 303
 304 __b_evaluation2:
 305         @@ at this point, R0=block (temp),  R1-R11 (free)
 306         @@     R12=__const_ptr_, R14=&block[n] (top element of the current column)
 307         @@ proceed with b0-b3 first, followed by a0-a3
 308         @@ MUL16(b0, W1, col[8x1]);
 309         @@ MUL16(b1, W3, col[8x1]);
 310         @@ MUL16(b2, W5, col[8x1]);
 311         @@ MUL16(b3, W7, col[8x1]);
 312         @@ MAC16(b0, W3, col[8x3]);
 313         @@ MAC16(b1, -W7, col[8x3]);
 314         @@ MAC16(b2, -W1, col[8x3]);
 315         @@ MAC16(b3, -W5, col[8x3]);
 316         ldr r8, [r12, #offW1]    @ R8=W1
 317         ldrsh r7, [r14, #16]     @ R7=COLr16[8x1] (consecutive column elements are 16 bytes apart)
 318         mul r0, r8, r7           @ R0=W1*COLr16[8x1]=b0 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 319         ldr r9, [r12, #offW3]    @ R9=W3
 320         ldr r10, [r12, #offW5]   @ R10=W5
 321         mul r1, r9, r7           @ R1=W3*COLr16[8x1]=b1 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 322         ldr r11, [r12, #offW7]   @ R11=W7
 323         mul r5, r10, r7          @ R5=W5*COLr16[8x1]=b2 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 324         ldrsh r2, [r14, #48]     @ R2=COLr16[8x3]
 325         mul r7, r11, r7          @ R7=W7*COLr16[8x1]=b3 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 326         teq r2, #0               @ if 0, then avoid muls
 327         mlane r0, r9, r2, r0     @ R0+=W3*COLr16[8x3]=b0 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 328         rsbne r2, r2, #0         @ R2=-COLr16[8x3]
 329         mlane r1, r11, r2, r1    @ R1-=W7*COLr16[8x3]=b1 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 330         mlane r5, r8, r2, r5     @ R5-=W1*COLr16[8x3]=b2 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 331         mlane r7, r10, r2, r7    @ R7-=W5*COLr16[8x3]=b3 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 332
 333         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 334         @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 335         @@     R12=__const_ptr_, R14=&block[n]
 336         @@ MAC16(b0, W5, col[5x8]);
 337         @@ MAC16(b2, W7, col[5x8]);
 338         @@ MAC16(b3, W3, col[5x8]);
 339         @@ MAC16(b1, -W1, col[5x8]);
 340         @@ MAC16(b0, W7, col[7x8]);
 341         @@ MAC16(b2, W3, col[7x8]);
 342         @@ MAC16(b3, -W1, col[7x8]);
 343         @@ MAC16(b1, -W5, col[7x8]);
 344         ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
 345         teq r3, #0               @ if 0 then avoid muls
 346         mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
 347         mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
 348         mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
 349         rsbne r3, r3, #0         @ R3=-COLr16[5x8]
 350         ldrsh r4, [r14, #112]    @ R4=COLr16[7x8] (a load does not change the flags, the next mlane still tests COLr16[5x8])
 351         mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
 352         @@ R3 is free now
 353         teq r4, #0               @ if 0 then avoid muls
 354         mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
 355         mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
 356         rsbne r4, r4, #0         @ R4=-COLr16[7x8]
 357         mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
 358         mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
 359         @@ R4 is free now
 360 __end_b_evaluation2:
 361         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 362         @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 363         @@     R12=__const_ptr_, R14=&block[n]
 364
 365 __a_evaluation2:
 366         @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
 367         @@ a1 = a0 + W6 * col[8x2];
 368         @@ a2 = a0 - W6 * col[8x2];
 369         @@ a3 = a0 - W2 * col[8x2];
 370         @@ a0 = a0 + W2 * col[8x2];
 371         ldrsh r6, [r14, #0]      @ R6=COLr16[8x0]
 372         ldr r9, [r12, #offW4]    @ R9=W4
 373         mul r6, r9, r6           @ R6=W4*COLr16[8x0]
 374         ldr r10, [r12, #offW6]   @ R10=W6
 375         ldrsh r4, [r14, #32]     @ R4=COLr16[8x2] (a3 not defined yet)
 376         add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + 1<<(COL_SHIFT-1) (a0)
 377         mul r11, r10, r4         @ R11=W6*COLr16[8x2]
 378         ldr r8, [r12, #offW2]    @ R8=W2
 379         add r2, r6, r11          @ R2=a0+W6*COLr16[8x2] (a1)
 380         sub r3, r6, r11          @ R3=a0-W6*COLr16[8x2] (a2)
 381         mul r11, r8, r4          @ R11=W2*COLr16[8x2]
 382         sub r4, r6, r11          @ R4=a0-W2*COLr16[8x2] (a3)
 383         add r6, r6, r11          @ R6=a0+W2*COLr16[8x2] (a0)
 384
 385         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 386         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 387         @@     R12=__const_ptr_, R14=&block[n]
 388         @@ a0 += W4*col[8x4]
 389         @@ a1 -= W4*col[8x4]
 390         @@ a2 -= W4*col[8x4]
 391         @@ a3 += W4*col[8x4]
 392         ldrsh r11, [r14, #64]    @ R11=COLr16[8x4]
 393         teq r11, #0              @ if null avoid muls
 394         mulne r11, r9, r11       @ R11=W4*COLr16[8x4]
 395         @@ R9 is free now
 396         addne r6, r6, r11        @ R6+=W4*COLr16[8x4] (a0)
 397         subne r2, r2, r11        @ R2-=W4*COLr16[8x4] (a1)
 398         subne r3, r3, r11        @ R3-=W4*COLr16[8x4] (a2)
 399         ldrsh r9, [r14, #96]     @ R9=COLr16[8x6] (a load does not change the flags, the next addne still tests COLr16[8x4])
 400         addne r4, r4, r11        @ R4+=W4*COLr16[8x4] (a3)
 401         @@ W6 alone is no longer needed after the next mul, so R10 is reused to hold W2*COLr16[8x6]
 402         teq r9, #0               @ if null avoid muls
 403         mulne r11, r10, r9       @ R11=W6*COLr16[8x6]
 404         addne r6, r6, r11        @ R6+=W6*COLr16[8x6] (a0)
 405         mulne r10, r8, r9        @ R10=W2*COLr16[8x6]
 406         @@ a0 += W6*col[8x6];
 407         @@ a3 -= W6*col[8x6];
 408         @@ a1 -= W2*col[8x6];
 409         @@ a2 += W2*col[8x6];
 410         subne r4, r4, r11        @ R4-=W6*COLr16[8x6] (a3)
 411         subne r2, r2, r10        @ R2-=W2*COLr16[8x6] (a1)
 412         addne r3, r3, r10        @ R3+=W2*COLr16[8x6] (a2)
 413 __end_a_evaluation2:
 414         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 415         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 416         @@     R12=__const_ptr_, R14=&block[n]
 417         @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
 418         @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
 419         @@ col[16] = ((a2 + b2) >> COL_SHIFT);
 420         @@ col[24] = ((a3 + b3) >> COL_SHIFT);
 421         @@ col[32] = ((a3 - b3) >> COL_SHIFT);
 422         @@ col[40] = ((a2 - b2) >> COL_SHIFT);
 423         @@ col[48] = ((a1 - b1) >> COL_SHIFT);
 424         @@ col[56] = ((a0 - b0) >> COL_SHIFT);
 425         @@@@@ no optimisation here: each result is shifted and stored as one half-word (strh keeps the low 16 bits) @@@@@
 426         add r8, r6, r0           @ R8=a0+b0
 427         add r9, r2, r1           @ R9=a1+b1
 428         mov r8, r8, asr #COL_SHIFT
 429         mov r9, r9, asr #COL_SHIFT
 430         strh r8, [r14, #0]       @ col[0]
 431         strh r9, [r14, #16]      @ col[8]
 432         add r8, r3, r5           @ R8=a2+b2
 433         add r9, r4, r7           @ R9=a3+b3
 434         mov r8, r8, asr #COL_SHIFT
 435         mov r9, r9, asr #COL_SHIFT
 436         strh r8, [r14, #32]      @ col[16]
 437         strh r9, [r14, #48]      @ col[24]
 438         sub r8, r4, r7           @ R8=a3-b3
 439         sub r9, r3, r5           @ R9=a2-b2
 440         mov r8, r8, asr #COL_SHIFT
 441         mov r9, r9, asr #COL_SHIFT
 442         strh r8, [r14, #64]      @ col[32]
 443         strh r9, [r14, #80]      @ col[40]
 444         sub r8, r2, r1           @ R8=a1-b1
 445         sub r9, r6, r0           @ R9=a0-b0
 446         mov r8, r8, asr #COL_SHIFT
 447         mov r9, r9, asr #COL_SHIFT
 448         strh r8, [r14, #96]      @ col[48]
 449         strh r9, [r14, #112]     @ col[56]
 450
 451 __end_col_loop:
 452         @@ at this point, R0-R11 (free)
 453         @@     R12=__const_ptr_, R14=&block[n]
 454         ldr r0, [sp, #0]         @ R0=block
 455         teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
 456         sub r14, r14, #2         @ step back to the previous column (2 bytes); sub without S keeps the flags of the teq
 457         bne __col_loop           @ loop unless the column just processed was column 0
 458
 459
 460
 461
 462 __end_simple_idct_ARM:
 463         @@ restore registers to previous status!
 464         add sp, sp, #8 @@ release the 2 local variables!
 465         ldmfd sp!, {r4-r11, r15} @@ restore R4-R11 and update PC with the saved LR content, i.e. return to the caller.
 466
 467
 468
 469 @@ kind of sub-function, kept out of line so that it does not burden the common case.
     @@ Reached from __a_evaluation when ROWr32[2] | ROWr32[3] is null: it finishes a1, a3 and a0
     @@ (a2 is already in R3) and jumps to __end_a_evaluation, skipping the row[4] and row[6] terms.
     @@ On entry: R4=ROWr16[2], R6=a0 (W4*ROWr16[0] + rounding), R8=W2, R11=W6*ROWr16[2].
 470 __end_bef_a_evaluation:
 471         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 472         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 473         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 474         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 475         bal __end_a_evaluation
 476
 477
 478 __constant_ptr__:  @@ see #defines at the beginning of the source code for values.
             @@ Table of 32-bit constants addressed through R12; the order of the words must
             @@ match the offW1..offW7 / offMASK_MSHW byte offsets defined at the top of the file.
             @@ NOTE(review): the label is placed before .align, so if the assembler ever had to
             @@ insert padding here the label would point at the padding and not at W1; this
             @@ presumably never happens because the code above is made of 4-byte instructions.
 479         .align
 480         .word   W1
 481         .word   W2
 482         .word   W3
 483         .word   W4
 484         .word   W5
 485         .word   W6
 486         .word   W7
 487         .word   MASK_MSHW