git.sesse.net Git - ffmpeg/blob - libavcodec/arm/simple_idct_arm.S

   1 /*
   2  * simple_idct_arm.S
   3  * Copyright (C) 2002 Frederic 'dilb' Boulay
   4  *
   5  * Author: Frederic Boulay <dilb@handhelds.org>
   6  *
   7  * The function defined in this file is derived from the simple_idct function
   8  * from the libavcodec library part of the FFmpeg project.
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "asm.S"
  28
  29 /* Integer coefficients W1..W7 used by the algorithm; they are also stored as */
  30 /* 32-bit words in __constant_ptr__ at the end of the source code. */
  31 #define W1  22725
  32 #define W2  21407
  33 #define W3  19266
  34 #define W4  16383
  35 #define W5  12873
  36 #define W6  8867
  37 #define W7  4520
  38 #define MASK_MSHW 0xFFFF0000
  39
  40 /* byte offsets of the constants inside the __constant_ptr__ vector */
  41 #define offW1  0
  42 #define offW2  4
  43 #define offW3  8
  44 #define offW4  12
  45 #define offW5  16
  46 #define offW6  20
  47 #define offW7  24
  48 #define offMASK_MSHW 28
  49
  50 #define ROW_SHIFT 11
  51 #define ROW_SHIFT2MSHW (16-ROW_SHIFT) /* left shift that puts (x >> ROW_SHIFT) in the upper half-word */
  52 #define COL_SHIFT 20
  53 #define ROW_SHIFTED_1 (1 << (ROW_SHIFT-1)) /* rounding term of the row pass, 1024 */
  54 #define COL_SHIFTED_1 (1 << (COL_SHIFT-1)) /* rounding term of the column pass, 524288 */
  55
  56
  57         .text
  58
  59 function ff_simple_idct_arm, export=1
  60         @@ void ff_simple_idct_arm(int16_t *block)
  61         @@ In-place IDCT of the 8x8 block of 16-bit coefficients pointed to by R0: a row pass over the 8 rows, then a column pass over the 8 columns.
  62         @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
  63         @@ and it is overwritten by the computation, so block is kept in a stack slot and reloaded from there.
  64         @@ R12 is another scratch register, so it does not need to be saved either
  65         @@ save R4-R11 and the return address
  66         stmfd sp!, {r4-r11, r14} @ R14 is also called LR
  67         @@ at this point, R0=block, other registers are free.
  68         add r14, r0, #112        @ R14=&block[8*7]: start from the last row and step back one row per iteration until row=0, i.e. R14=block.
  69         adr r12, __constant_ptr__ @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
  70         @@ reserve 2 words of local storage on the stack, only the first one is used (for R0)
  71         sub sp, sp, #8          @ allow 2 local variables
  72         str r0, [sp, #0]        @ save block in sp[0]
  73         @@ stack status
  74         @@ sp+4   free
  75         @@ sp+0   R0  (block)
  76
  77
  78         @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
  79
  80
  81 __row_loop:
  82         @@ Row pass, one row of 8 coefficients per iteration. Read the row as 4 32-bit words and check if it is null, almost null, or not. According to the StrongARM specs, it is not necessary to optimize ldr accesses (i.e. split 32 bits into 2 16-bit words), and at least it gives more usable registers :)
  83         ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (current row cast to a 32b pointer)
  84         ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
  85         ldr r3, [r14, #8]        @ R3=ROWr32[2]
  86         ldr r4, [r14, #12]       @ R4=ROWr32[3]
  87         @@ check if the words are null: if all of them are null, then proceed with the next row (branch __end_row_loop),
  88         @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
  89         @@ else follow the complete algorithm.
  90         @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
  91         @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
  92         orr r5, r4, r3           @ R5=R4 | R3
  93         orr r5, r5, r2           @ R5=R4 | R3 | R2
  94         orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null); only the flags matter, R6 is reloaded below
  95         beq __end_row_loop       @ the whole row is null: nothing to compute, the row is left unchanged
  96         mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
  97         ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
  98         orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7, null when ROWr16[1..7] are all null
  99         beq __almost_empty_row
 100
 101 __b_evaluation:
 102         @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
 103         @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
 104         @@     R12=__const_ptr_, R14=&block[n]
 105         @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
 106
 107         @@ MUL16(b0, W1, row[1]);
 108         @@ MUL16(b1, W3, row[1]);
 109         @@ MUL16(b2, W5, row[1]);
 110         @@ MUL16(b3, W7, row[1]);
 111         @@ MAC16(b0, W3, row[3]);
 112         @@ MAC16(b1, -W7, row[3]);
 113         @@ MAC16(b2, -W1, row[3]);
 114         @@ MAC16(b3, -W5, row[3]);
 115         ldr r8, [r12, #offW1]    @ R8=W1
 116         mov r2, r2, asr #16      @ R2=ROWr16[3]
 117         mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 118         ldr r9, [r12, #offW3]    @ R9=W3
 119         ldr r10, [r12, #offW5]   @ R10=W5
 120         mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 121         ldr r11, [r12, #offW7]   @ R11=W7
 122         mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 123         mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 124         teq r2, #0               @ if null avoid muls
 125         itttt ne                 @ IT block: the 4 following instructions are executed only if NE
 126         mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 127         rsbne r2, r2, #0         @ R2=-ROWr16[3]
 128         mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 129         mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 130         it    ne                 @ an IT block holds at most 4 instructions, so the 5th one gets its own
 131         mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 132
 133         @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 134         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 135         @@     R12=__const_ptr_, R14=&block[n]
 136         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 137         @@ if (temp != 0) {}
 138         orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3] (kept in R2, tested again in __a_evaluation)
 139         beq __end_b_evaluation   @ ROWr16[4..7] are all null: no row[5]/row[7] contribution to b0-b3
 140
 141         @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 142         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 143         @@     R12=__const_ptr_, R14=&block[n]
 144         @@ MAC16(b0, W5, row[5]);
 145         @@ MAC16(b2, W7, row[5]);
 146         @@ MAC16(b3, W3, row[5]);
 147         @@ MAC16(b1, -W1, row[5]);
 148         @@ MAC16(b0, W7, row[7]);
 149         @@ MAC16(b2, W3, row[7]);
 150         @@ MAC16(b3, -W1, row[7]);
 151         @@ MAC16(b1, -W5, row[7]);
 152         mov r3, r3, asr #16      @ R3=ROWr16[5]
 153         teq r3, #0               @ if null avoid muls
 154         it    ne
 155         mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
 156         mov r4, r4, asr #16      @ R4=ROWr16[7] (mov without S keeps the flags, the test on R3 is still valid below)
 157         itttt ne
 158         mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
 159         mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
 160         rsbne r3, r3, #0         @ R3=-ROWr16[5]
 161         mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
 162         @@ R3 is free now
 163         teq r4, #0               @ if null avoid muls
 164         itttt ne
 165         mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
 166         mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
 167         rsbne r4, r4, #0         @ R4=-ROWr16[7]
 168         mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
 169         it    ne
 170         mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
 171         @@ R4 is free now
 172 __end_b_evaluation:
 173         @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
 174         @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 175         @@     R12=__const_ptr_, R14=&block[n]
 176
 177 __a_evaluation:
 178         @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
 179         @@ a1 = a0 + W6 * row[2];
 180         @@ a2 = a0 - W6 * row[2];
 181         @@ a3 = a0 - W2 * row[2];
 182         @@ a0 = a0 + W2 * row[2];
 183         ldr r9, [r12, #offW4]    @ R9=W4
 184         mul r6, r9, r6           @ R6=W4*ROWr16[0]
 185         ldr r10, [r12, #offW6]   @ R10=W6
 186         ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
 187         add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
 188
 189         mul r11, r10, r4         @ R11=W6*ROWr16[2]
 190         ldr r8, [r12, #offW2]    @ R8=W2
 191         sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
 192         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 193         @@ if (temp != 0) {}
 194         teq r2, #0               @ R2 still holds ROWr32[2] | ROWr32[3] from the b evaluation
 195         beq __end_bef_a_evaluation @ ROWr16[4..7] are all null: a1, a3 and a0 are finished out of line, skipping the row[4] and row[6] terms
 196
 197         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 198         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 199         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 200         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 201
 202
 203         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 204         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 205         @@     R12=__const_ptr_, R14=&block[n]
 206
 207
 208         @@ a0 += W4*row[4]
 209         @@ a1 -= W4*row[4]
 210         @@ a2 -= W4*row[4]
 211         @@ a3 += W4*row[4]
 212         ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
 213         teq r11, #0              @ if null avoid muls
 214         it    ne
 215         mulne r11, r9, r11       @ R11=W4*ROWr16[4]
 216         @@ R9 is free now
 217         ldrsh r9, [r14, #12]     @ R9=ROWr16[6] (a load keeps the flags, the test on ROWr16[4] is still valid below)
 218         itttt ne
 219         addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
 220         subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
 221         subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
 222         addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
 223         @@ W6 (R10) alone is not needed any more once R11=W6*ROWr16[6] is computed, so R10 is reused for W2*ROWr16[6]
 224         teq r9, #0               @ if null avoid muls
 225         itttt ne
 226         mulne r11, r10, r9       @ R11=W6*ROWr16[6]
 227         addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
 228         mulne r10, r8, r9        @ R10=W2*ROWr16[6]
 229         @@ a0 += W6*row[6];
 230         @@ a3 -= W6*row[6];
 231         @@ a1 -= W2*row[6];
 232         @@ a2 += W2*row[6];
 233         subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
 234         itt   ne
 235         subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
 236         addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
 237
 238 __end_a_evaluation:
 239         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 240         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 241         @@     R12=__const_ptr_, R14=&block[n]
 242         @@ row[0] = (a0 + b0) >> ROW_SHIFT;
 243         @@ row[1] = (a1 + b1) >> ROW_SHIFT;
 244         @@ row[2] = (a2 + b2) >> ROW_SHIFT;
 245         @@ row[3] = (a3 + b3) >> ROW_SHIFT;
 246         @@ row[4] = (a3 - b3) >> ROW_SHIFT;
 247         @@ row[5] = (a2 - b2) >> ROW_SHIFT;
 248         @@ row[6] = (a1 - b1) >> ROW_SHIFT;
 249         @@ row[7] = (a0 - b0) >> ROW_SHIFT;
 250         add r8, r6, r0           @ R8=a0+b0
 251         add r9, r2, r1           @ R9=a1+b1
 252         @@ put 2 16 bits half-words in a 32bits word
 253         @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
 254         ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
 255         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5), i.e. the low 16 bits of (a1+b1)>>11 placed in the upper half-word
 256         mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
 257         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
 258         orr r8, r8, r9
 259         str r8, [r14, #0]        @ store row[0] and row[1] with one word access
 260
 261         add r8, r3, r5           @ R8=a2+b2
 262         add r9, r4, r7           @ R9=a3+b3
 263         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
 264         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
 265         orr r8, r8, r9
 266         str r8, [r14, #4]        @ store row[2] and row[3]
 267
 268         sub r8, r4, r7           @ R8=a3-b3
 269         sub r9, r3, r5           @ R9=a2-b2
 270         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
 271         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
 272         orr r8, r8, r9
 273         str r8, [r14, #8]        @ store row[4] and row[5]
 274
 275         sub r8, r2, r1           @ R8=a1-b1
 276         sub r9, r6, r0           @ R9=a0-b0
 277         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
 278         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
 279         orr r8, r8, r9
 280         str r8, [r14, #12]       @ store row[6] and row[7]
 281
 282         bal __end_row_loop       @ jump over the almost-empty-row special case
 283
 284 __almost_empty_row:
 285         @@ the row was empty, except ROWr16[0]: in this special case every entry of the row is set to ROWr16[0]<<3
 286         @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
 287         @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
 288         @@                R8 (temp, set to 0xFFFF below), R9-R11 free
 289         mov r8, #0x10000         @ R8=0x10000, first of the 2 steps needed to build 0xFFFF; it saves a ldr from the constant vector.
 290         sub r8, r8, #1           @ R8=0xFFFF, R8 is now ready.
 291         and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
 292         orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16), the same half-word in both halves
 293         str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
 294         str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
 295         str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
 296         str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
 297
 298 __end_row_loop:
 299         @@ at this point, R0-R11 (free)
 300         @@     R12=__const_ptr_, R14=&block[n]
 301         ldr r0, [sp, #0]         @ R0=block
 302         teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
 303         sub r14, r14, #16        @ step back to the previous row (16 bytes); sub without S keeps the flags set by teq
 304         bne __row_loop
 305
 306
 307
 308         @@ Column pass. At this point, R0=block, R1-R11 (free)
 309         @@     R12=__const_ptr_, R14 is reloaded just below
 310         add r14, r0, #14        @ R14=&block[7]: start from the last col and step back one col per iteration until col=0, i.e. R14=block.
 311 __col_loop:
 312
 313 __b_evaluation2:
 314         @@ at this point, R0=block (temp),  R1-R11 (free)
 315         @@     R12=__const_ptr_, R14=&block[n]
 316         @@ proceed with b0-b3 first, followed by a0-a3
 317         @@ MUL16(b0, W1, col[8x1]);
 318         @@ MUL16(b1, W3, col[8x1]);
 319         @@ MUL16(b2, W5, col[8x1]);
 320         @@ MUL16(b3, W7, col[8x1]);
 321         @@ MAC16(b0, W3, col[8x3]);
 322         @@ MAC16(b1, -W7, col[8x3]);
 323         @@ MAC16(b2, -W1, col[8x3]);
 324         @@ MAC16(b3, -W5, col[8x3]);
 325         ldr r8, [r12, #offW1]    @ R8=W1
 326         ldrsh r7, [r14, #16]     @ R7=COLr16[1x8] (consecutive entries of a column are 16 bytes apart)
 327         mul r0, r8, r7           @ R0=W1*COLr16[1x8]=b0 (COLr16[1x8] must be the second arg, to have the possibility to save 1 cycle)
 328         ldr r9, [r12, #offW3]    @ R9=W3
 329         ldr r10, [r12, #offW5]   @ R10=W5
 330         mul r1, r9, r7           @ R1=W3*COLr16[1x8]=b1 (COLr16[1x8] must be the second arg, to have the possibility to save 1 cycle)
 331         ldr r11, [r12, #offW7]   @ R11=W7
 332         mul r5, r10, r7          @ R5=W5*COLr16[1x8]=b2 (COLr16[1x8] must be the second arg, to have the possibility to save 1 cycle)
 333         ldrsh r2, [r14, #48]     @ R2=COLr16[3x8]
 334         mul r7, r11, r7          @ R7=W7*COLr16[1x8]=b3 (COLr16[1x8] must be the second arg, to have the possibility to save 1 cycle)
 335         teq r2, #0               @ if 0, then avoid muls
 336         itttt ne                 @ IT block: the 4 following instructions are executed only if NE
 337         mlane r0, r9, r2, r0     @ R0+=W3*COLr16[3x8]=b0 (COLr16[3x8] must be the second arg, to have the possibility to save 1 cycle)
 338         rsbne r2, r2, #0         @ R2=-COLr16[3x8]
 339         mlane r1, r11, r2, r1    @ R1-=W7*COLr16[3x8]=b1 (COLr16[3x8] must be the second arg, to have the possibility to save 1 cycle)
 340         mlane r5, r8, r2, r5     @ R5-=W1*COLr16[3x8]=b2 (COLr16[3x8] must be the second arg, to have the possibility to save 1 cycle)
 341         it    ne                 @ an IT block holds at most 4 instructions, so the 5th one gets its own
 342         mlane r7, r10, r2, r7    @ R7-=W5*COLr16[3x8]=b3 (COLr16[3x8] must be the second arg, to have the possibility to save 1 cycle)
 343
 344         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 345         @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 346         @@     R12=__const_ptr_, R14=&block[n]
 347         @@ MAC16(b0, W5, col[5x8]);
 348         @@ MAC16(b2, W7, col[5x8]);
 349         @@ MAC16(b3, W3, col[5x8]);
 350         @@ MAC16(b1, -W1, col[5x8]);
 351         @@ MAC16(b0, W7, col[7x8]);
 352         @@ MAC16(b2, W3, col[7x8]);
 353         @@ MAC16(b3, -W1, col[7x8]);
 354         @@ MAC16(b1, -W5, col[7x8]);
 355         ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
 356         teq r3, #0               @ if 0 then avoid muls
 357         itttt ne
 358         mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
 359         mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
 360         mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
 361         rsbne r3, r3, #0         @ R3=-COLr16[5x8]
 362         ldrsh r4, [r14, #112]    @ R4=COLr16[7x8] (a load keeps the flags, the test on COLr16[5x8] is still valid below)
 363         it    ne
 364         mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
 365         @@ R3 is free now
 366         teq r4, #0               @ if 0 then avoid muls
 367         itttt ne
 368         mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
 369         mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
 370         rsbne r4, r4, #0         @ R4=-COLr16[7x8]
 371         mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
 372         it    ne
 373         mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
 374         @@ R4 is free now
 375 __end_b_evaluation2:
 376         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 377         @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 378         @@     R12=__const_ptr_, R14=&block[n]
 379
 380 __a_evaluation2:
 381         @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
 382         @@ a1 = a0 + W6 * col[8x2];
 383         @@ a2 = a0 - W6 * col[8x2];
 384         @@ a3 = a0 - W2 * col[8x2];
 385         @@ a0 = a0 + W2 * col[8x2];
 386         ldrsh r6, [r14, #0]      @ R6=COLr16[0x8]
 387         ldr r9, [r12, #offW4]    @ R9=W4
 388         mul r6, r9, r6           @ R6=W4*COLr16[0x8]
 389         ldr r10, [r12, #offW6]   @ R10=W6
 390         ldrsh r4, [r14, #32]     @ R4=COLr16[2x8] (a3 not defined yet)
 391         add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[0x8] + 1<<(COL_SHIFT-1) (a0)
 392         mul r11, r10, r4         @ R11=W6*COLr16[2x8]
 393         ldr r8, [r12, #offW2]    @ R8=W2
 394         add r2, r6, r11          @ R2=a0+W6*COLr16[2x8] (a1)
 395         sub r3, r6, r11          @ R3=a0-W6*COLr16[2x8] (a2)
 396         mul r11, r8, r4          @ R11=W2*COLr16[2x8]
 397         sub r4, r6, r11          @ R4=a0-W2*COLr16[2x8] (a3)
 398         add r6, r6, r11          @ R6=a0+W2*COLr16[2x8] (a0)
 399
 400         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 401         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 402         @@     R12=__const_ptr_, R14=&block[n]
 403         @@ a0 += W4*col[8x4]
 404         @@ a1 -= W4*col[8x4]
 405         @@ a2 -= W4*col[8x4]
 406         @@ a3 += W4*col[8x4]
 407         ldrsh r11, [r14, #64]    @ R11=COLr16[4x8]
 408         teq r11, #0              @ if null avoid muls
 409         itttt ne
 410         mulne r11, r9, r11       @ R11=W4*COLr16[4x8]
 411         @@ R9 is free now
 412         addne r6, r6, r11        @ R6+=W4*COLr16[4x8] (a0)
 413         subne r2, r2, r11        @ R2-=W4*COLr16[4x8] (a1)
 414         subne r3, r3, r11        @ R3-=W4*COLr16[4x8] (a2)
 415         ldrsh r9, [r14, #96]     @ R9=COLr16[6x8] (a load keeps the flags, the test on COLr16[4x8] is still valid below)
 416         it    ne
 417         addne r4, r4, r11        @ R4+=W4*COLr16[4x8] (a3)
 418         @@ W6 (R10) alone is not needed any more once R11=W6*COLr16[6x8] is computed, so R10 is reused for W2*COLr16[6x8]
 419         teq r9, #0               @ if null avoid muls
 420         itttt ne
 421         mulne r11, r10, r9       @ R11=W6*COLr16[6x8]
 422         addne r6, r6, r11        @ R6+=W6*COLr16[6x8] (a0)
 423         mulne r10, r8, r9        @ R10=W2*COLr16[6x8]
 424         @@ a0 += W6*col[8x6];
 425         @@ a3 -= W6*col[8x6];
 426         @@ a1 -= W2*col[8x6];
 427         @@ a2 += W2*col[8x6];
 428         subne r4, r4, r11        @ R4-=W6*COLr16[6x8] (a3)
 429         itt   ne
 430         subne r2, r2, r10        @ R2-=W2*COLr16[6x8] (a1)
 431         addne r3, r3, r10        @ R3+=W2*COLr16[6x8] (a2)
 432 __end_a_evaluation2:
 433         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 434         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 435         @@     R12=__const_ptr_, R14=&block[n]
 436         @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
 437         @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
 438         @@ col[16] = ((a2 + b2) >> COL_SHIFT);
 439         @@ col[24] = ((a3 + b3) >> COL_SHIFT);
 440         @@ col[32] = ((a3 - b3) >> COL_SHIFT);
 441         @@ col[40] = ((a2 - b2) >> COL_SHIFT);
 442         @@ col[48] = ((a1 - b1) >> COL_SHIFT);
 443         @@ col[56] = ((a0 - b0) >> COL_SHIFT);
 444         @@@@@ no optimization here @@@@@
 445         add r8, r6, r0           @ R8=a0+b0
 446         add r9, r2, r1           @ R9=a1+b1
 447         mov r8, r8, asr #COL_SHIFT
 448         mov r9, r9, asr #COL_SHIFT
 449         strh r8, [r14, #0]       @ col[0]
 450         strh r9, [r14, #16]      @ col[8]
 451         add r8, r3, r5           @ R8=a2+b2
 452         add r9, r4, r7           @ R9=a3+b3
 453         mov r8, r8, asr #COL_SHIFT
 454         mov r9, r9, asr #COL_SHIFT
 455         strh r8, [r14, #32]      @ col[16]
 456         strh r9, [r14, #48]      @ col[24]
 457         sub r8, r4, r7           @ R8=a3-b3
 458         sub r9, r3, r5           @ R9=a2-b2
 459         mov r8, r8, asr #COL_SHIFT
 460         mov r9, r9, asr #COL_SHIFT
 461         strh r8, [r14, #64]      @ col[32]
 462         strh r9, [r14, #80]      @ col[40]
 463         sub r8, r2, r1           @ R8=a1-b1
 464         sub r9, r6, r0           @ R9=a0-b0
 465         mov r8, r8, asr #COL_SHIFT
 466         mov r9, r9, asr #COL_SHIFT
 467         strh r8, [r14, #96]      @ col[48]
 468         strh r9, [r14, #112]     @ col[56]
 469
 470 __end_col_loop:
 471         @@ at this point, R0-R11 (free)
 472         @@     R12=__const_ptr_, R14=&block[n]
 473         ldr r0, [sp, #0]         @ R0=block
 474         teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
 475         sub r14, r14, #2         @ step back to the previous column (2 bytes); sub without S keeps the flags set by teq
 476         bne __col_loop
 477
 478
 479
 480
 481 __end_simple_idct_arm:
 482         @@ function exit: release the local storage and restore the registers saved on entry
 483         add sp, sp, #8 @@ drop the 2 words reserved for the local variables
 484         ldmfd sp!, {r4-r11, r15} @@ restore R4-R11 and load the saved LR into PC (R15), which returns to the caller.
 485
 486
 487
 488 @@ kind of sub-function of the row pass, placed here not to overload the common case: reached from __a_evaluation when ROWr32[2] | ROWr32[3] is null, with R6=a0, R11=W6*ROWr16[2], R8=W2, R4=ROWr16[2] (R3=a2 is already computed).
 489 __end_bef_a_evaluation:
 490         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 491         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 492         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3), must come before R6 is updated
 493         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 494         bal __end_a_evaluation   @ rejoin the row pass at the output stage, skipping the row[4] and row[6] terms
 495
 496
 497         .align             @@ pad to a word boundary BEFORE the label: in a Thumb build the code above may end on a 2-byte boundary, and the label must name the first .word, not the padding
 498 __constant_ptr__:  @@ constants read through R12 with the offW* offsets, see #defines at the beginning of the source code for values.
 499         .word   W1         @@ offW1
 500         .word   W2         @@ offW2
 501         .word   W3         @@ offW3
 502         .word   W4         @@ offW4
 503         .word   W5         @@ offW5
 504         .word   W6         @@ offW6
 505         .word   W7         @@ offW7
 506         .word   MASK_MSHW  @@ offMASK_MSHW