git.sesse.net Git - ffmpeg/blob - libavcodec/arm/simple_idct_arm.S

   1 /*
   2  * Copyright (C) 2002 Frederic 'dilb' Boulay
   3  *
   4  * Author: Frederic Boulay <dilb@handhelds.org>
   5  *
   6  * The function defined in this file is derived from the simple_idct function
   7  * from the libavcodec library part of the Libav project.
   8  *
   9  * This file is part of Libav.
  10  *
  11  * Libav is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License as published by the Free Software Foundation; either
  14  * version 2.1 of the License, or (at your option) any later version.
  15  *
  16  * Libav is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with Libav; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24  */
  25
  26 #include "asm.S"
  27
  28 /* Useful constants for the algorithm; they are stored in the __constant_ptr__ */
  29 /* table at the end of the source code. */
  30 #define W1  22725
  31 #define W2  21407
  32 #define W3  19266
  33 #define W4  16383
  34 #define W5  12873
  35 #define W6  8867
  36 #define W7  4520
  37 #define MASK_MSHW 0xFFFF0000 /* keeps only the most significant half-word */
  38
  39 /* byte offsets of the constants in the __constant_ptr__ table (one 32-bit .word each) */
  40 #define offW1  0
  41 #define offW2  4
  42 #define offW3  8
  43 #define offW4  12
  44 #define offW5  16
  45 #define offW6  20
  46 #define offW7  24
  47 #define offMASK_MSHW 28
  48
  49 #define ROW_SHIFT 11
  50 #define ROW_SHIFT2MSHW (16-ROW_SHIFT) /* x << this, masked with MASK_MSHW, is (x >> ROW_SHIFT) in the upper half-word */
  51 #define COL_SHIFT 20
  52 #define ROW_SHIFTED_1 (1 << (ROW_SHIFT-1)) /* = 1024, added to a0 before the >> ROW_SHIFT */
  53 #define COL_SHIFTED_1 (1 << (COL_SHIFT-1)) /* = 524288, added to a0 before the >> COL_SHIFT */
  54
  55
  56         .text
  57
  58 function ff_simple_idct_arm, export=1
  59         @@ void ff_simple_idct_arm(int16_t *block): processes the 8 rows of block in place, then its 8 columns
  60         @@ save on the stack the registers that are needed (take all of them),
  61         @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
  62         @@ so it must not be overwritten, if it is not saved!!
  63         @@ R12 is another scratch register, so it does not need to be saved either
  64         @@ save all registers
  65         stmfd sp!, {r4-r11, r14} @ R14 is also called LR
  66         @@ at this point, R0=block, other registers are free.
  67         add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R14=block.
  68         adr r12, __constant_ptr__ @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
  69         @@ reserve 2 temporary words on the stack; only the first one is used (it holds R0=block)
  70         sub sp, sp, #8          @ allow 2 local variables
  71         str r0, [sp, #0]        @ save block in sp[0], R0 is reused as b0 inside the loops
  72         @@ stack status
  73         @@ sp+4   free
  74         @@ sp+0   R0  (block)
  75
  76
  77         @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
  78
  79
  80 __row_loop:
  81         @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
  82         ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
  83         ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
  84         ldr r3, [r14, #8]        @ R3=ROWr32[2]
  85         ldr r4, [r14, #12]       @ R4=ROWr32[3]
  86         @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
  87         @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
  88         @@ else follow the complete algorithm.
  89         @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
  90         @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
  91         orr r5, r4, r3           @ R5=R4 | R3
  92         orr r5, r5, r2           @ R5=R4 | R3 | R2
  93         orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
  94         beq __end_row_loop       @ the whole row is zero: nothing to compute, nothing to store
  95         mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
  96         ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
  97         orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7, i.e. zero when only ROWr16[0] is not null
  98         beq __almost_empty_row
  99
 100 __b_evaluation:
 101         @@ at this point, R0=block (temp, overwritten by b0 below and reloaded from sp[0] in __end_row_loop),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
 102         @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
 103         @@     R12=__const_ptr_, R14=&block[n]
 104         @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
 105
 106         @@ MUL16(b0, W1, row[1]);
 107         @@ MUL16(b1, W3, row[1]);
 108         @@ MUL16(b2, W5, row[1]);
 109         @@ MUL16(b3, W7, row[1]);
 110         @@ MAC16(b0, W3, row[3]);
 111         @@ MAC16(b1, -W7, row[3]);
 112         @@ MAC16(b2, -W1, row[3]);
 113         @@ MAC16(b3, -W5, row[3]);
 114         ldr r8, [r12, #offW1]    @ R8=W1
 115         mov r2, r2, asr #16      @ R2=ROWr16[3]
 116         mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 117         ldr r9, [r12, #offW3]    @ R9=W3
 118         ldr r10, [r12, #offW5]   @ R10=W5
 119         mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 120         ldr r11, [r12, #offW7]   @ R11=W7
 121         mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 122         mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 123         teq r2, #0               @ if null avoid muls
 124         itttt ne
 125         mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 126         rsbne r2, r2, #0         @ R2=-ROWr16[3]
 127         mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 128         mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 129         it    ne                 @ an IT block covers at most 4 instructions, hence a second one for the 5th conditional instruction
 130         mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 131
 132         @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 133         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 134         @@     R12=__const_ptr_, R14=&block[n]
 135         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 136         @@ if (temp != 0) {}
 137         orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3], kept in R2 and tested again in __a_evaluation
 138         beq __end_b_evaluation
 139
 140         @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 141         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 142         @@     R12=__const_ptr_, R14=&block[n]
 143         @@ MAC16(b0, W5, row[5]);
 144         @@ MAC16(b2, W7, row[5]);
 145         @@ MAC16(b3, W3, row[5]);
 146         @@ MAC16(b1, -W1, row[5]);
 147         @@ MAC16(b0, W7, row[7]);
 148         @@ MAC16(b2, W3, row[7]);
 149         @@ MAC16(b3, -W1, row[7]);
 150         @@ MAC16(b1, -W5, row[7]);
 151         mov r3, r3, asr #16      @ R3=ROWr16[5]
 152         teq r3, #0               @ if null avoid muls
 153         it    ne
 154         mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
 155         mov r4, r4, asr #16      @ R4=ROWr16[7] (no S suffix, so the flags of teq r3 are still valid below)
 156         itttt ne
 157         mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
 158         mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
 159         rsbne r3, r3, #0         @ R3=-ROWr16[5]
 160         mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
 161         @@ R3 is free now
 162         teq r4, #0               @ if null avoid muls
 163         itttt ne
 164         mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
 165         mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
 166         rsbne r4, r4, #0         @ R4=-ROWr16[7]
 167         mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
 168         it    ne
 169         mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
 170         @@ R4 is free now
 171 __end_b_evaluation:
 172         @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
 173         @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 174         @@     R12=__const_ptr_, R14=&block[n]
 175
 176 __a_evaluation:
 177         @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
 178         @@ a1 = a0 + W6 * row[2];
 179         @@ a2 = a0 - W6 * row[2];
 180         @@ a3 = a0 - W2 * row[2];
 181         @@ a0 = a0 + W2 * row[2];
 182         ldr r9, [r12, #offW4]    @ R9=W4
 183         mul r6, r9, r6           @ R6=W4*ROWr16[0]
 184         ldr r10, [r12, #offW6]   @ R10=W6
 185         ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
 186         add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
 187
 188         mul r11, r10, r4         @ R11=W6*ROWr16[2]
 189         ldr r8, [r12, #offW2]    @ R8=W2
 190         sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
 191         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 192         @@ if (temp != 0) {}
 193         teq r2, #0               @ R2 still holds ROWr32[2] | ROWr32[3]
 194         beq __end_bef_a_evaluation @ row[4..7] are all null: finish a0-a3 out of line, skipping the row[4]/row[6] terms
 195
 196         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 197         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 198         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 199         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 200
 201
 202         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 203         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 204         @@     R12=__const_ptr_, R14=&block[n]
 205
 206
 207         @@ a0 += W4*row[4]
 208         @@ a1 -= W4*row[4]
 209         @@ a2 -= W4*row[4]
 210         @@ a3 += W4*row[4]
 211         ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
 212         teq r11, #0              @ if null avoid muls
 213         it    ne
 214         mulne r11, r9, r11       @ R11=W4*ROWr16[4]
 215         @@ R9 is free now
 216         ldrsh r9, [r14, #12]     @ R9=ROWr16[6] (a load does not change the flags, the teq r11 result is still used below)
 217         itttt ne
 218         addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
 219         subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
 220         subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
 221         addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
 222         @@ W6 alone is no longer needed, R10 is reused below to hold W2*ROWr16[6]
 223         teq r9, #0               @ if null avoid muls
 224         itttt ne
 225         mulne r11, r10, r9       @ R11=W6*ROWr16[6]
 226         addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
 227         mulne r10, r8, r9        @ R10=W2*ROWr16[6]
 228         @@ a0 += W6*row[6];
 229         @@ a3 -= W6*row[6];
 230         @@ a1 -= W2*row[6];
 231         @@ a2 += W2*row[6];
 232         subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
 233         itt   ne
 234         subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
 235         addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
 236
 237 __end_a_evaluation:
 238         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 239         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 240         @@     R12=__const_ptr_, R14=&block[n]
 241         @@ row[0] = (a0 + b0) >> ROW_SHIFT;
 242         @@ row[1] = (a1 + b1) >> ROW_SHIFT;
 243         @@ row[2] = (a2 + b2) >> ROW_SHIFT;
 244         @@ row[3] = (a3 + b3) >> ROW_SHIFT;
 245         @@ row[4] = (a3 - b3) >> ROW_SHIFT;
 246         @@ row[5] = (a2 - b2) >> ROW_SHIFT;
 247         @@ row[6] = (a1 - b1) >> ROW_SHIFT;
 248         @@ row[7] = (a0 - b0) >> ROW_SHIFT;
 249         add r8, r6, r0           @ R8=a0+b0
 250         add r9, r2, r1           @ R9=a1+b1
 251         @@ put 2 16 bits half-words in a 32bits word
 252         @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
 253         ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
 254         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5), i.e. row[1] already placed in the upper half-word
 255         mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
 256         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
 257         orr r8, r8, r9           @ R8=row[0] | (row[1]<<16)
 258         str r8, [r14, #0]        @ store row[0] and row[1] with one 32 bits access
 259
 260         add r8, r3, r5           @ R8=a2+b2
 261         add r9, r4, r7           @ R9=a3+b3
 262         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
 263         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
 264         orr r8, r8, r9           @ R8=row[2] | (row[3]<<16)
 265         str r8, [r14, #4]
 266
 267         sub r8, r4, r7           @ R8=a3-b3
 268         sub r9, r3, r5           @ R9=a2-b2
 269         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
 270         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
 271         orr r8, r8, r9           @ R8=row[4] | (row[5]<<16)
 272         str r8, [r14, #8]
 273
 274         sub r8, r2, r1           @ R8=a1-b1
 275         sub r9, r6, r0           @ R9=a0-b0
 276         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
 277         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
 278         orr r8, r8, r9           @ R8=row[6] | (row[7]<<16)
 279         str r8, [r14, #12]
 280
 281         bal __end_row_loop       @ jump over the __almost_empty_row special case
 282
 283 __almost_empty_row:
 284         @@ the row was empty, except ROWr16[0], now, management of this special case:
 285         @@ all the 8 values of the row are set to (ROWr16[0]<<3) & 0xFFFF
 286         @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
 287         @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free
 288         mov r8, #0x10000         @ R8=0x10000, first of the 2 steps needed to get 0xFFFF, it saves a ldr call (because of delay run).
 289         sub r8, r8, #1           @ R8=0xFFFF, R8 is now ready.
 290         and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
 291         orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16), the same value in both half-words
 292         str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
 293         str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
 294         str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
 295         str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
 296
 297 __end_row_loop:
 298         @@ at this point, R0-R11 (free)
 299         @@     R12=__const_ptr_, R14=&block[n]
 300         ldr r0, [sp, #0]         @ R0=block
 301         teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
 302         sub r14, r14, #16        @ previous row (8 * 2 bytes); no S suffix, so the flags of the teq are kept for the bne
 303         bne __row_loop
 304
 305
 306
 307         @@ at this point, R0=block, R1-R11 (free)
 308         @@     R12=__const_ptr_, R14=block-16 (not used any more, it is set again just below)
 309         add r14, r0, #14        @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
 310 __col_loop:
 311
 312 __b_evaluation2:
 313         @@ at this point, R0=block (temp),  R1-R11 (free)
 314         @@     R12=__const_ptr_, R14=&block[n]
 315         @@ proceed with b0-b3 first, followed by a0-a3
 316         @@ MUL16(b0, W1, col[8x1]);
 317         @@ MUL16(b1, W3, col[8x1]);
 318         @@ MUL16(b2, W5, col[8x1]);
 319         @@ MUL16(b3, W7, col[8x1]);
 320         @@ MAC16(b0, W3, col[8x3]);
 321         @@ MAC16(b1, -W7, col[8x3]);
 322         @@ MAC16(b2, -W1, col[8x3]);
 323         @@ MAC16(b3, -W5, col[8x3]);
 324         ldr r8, [r12, #offW1]    @ R8=W1
 325         ldrsh r7, [r14, #16]     @ R7=COLr16[1x8] (rows are 16 bytes apart)
 326         mul r0, r8, r7           @ R0=W1*COLr16[1x8]=b0 (COLr16[1x8] must be the second arg, to have the possibility to save 1 cycle)
 327         ldr r9, [r12, #offW3]    @ R9=W3
 328         ldr r10, [r12, #offW5]   @ R10=W5
 329         mul r1, r9, r7           @ R1=W3*COLr16[1x8]=b1 (COLr16[1x8] must be the second arg, to have the possibility to save 1 cycle)
 330         ldr r11, [r12, #offW7]   @ R11=W7
 331         mul r5, r10, r7          @ R5=W5*COLr16[1x8]=b2 (COLr16[1x8] must be the second arg, to have the possibility to save 1 cycle)
 332         ldrsh r2, [r14, #48]     @ R2=COLr16[3x8]
 333         mul r7, r11, r7          @ R7=W7*COLr16[1x8]=b3 (COLr16[1x8] must be the second arg, to have the possibility to save 1 cycle)
 334         teq r2, #0               @ if 0, then avoid muls
 335         itttt ne
 336         mlane r0, r9, r2, r0     @ R0+=W3*COLr16[3x8]=b0 (COLr16[3x8] must be the second arg, to have the possibility to save 1 cycle)
 337         rsbne r2, r2, #0         @ R2=-COLr16[3x8]
 338         mlane r1, r11, r2, r1    @ R1-=W7*COLr16[3x8]=b1 (COLr16[3x8] must be the second arg, to have the possibility to save 1 cycle)
 339         mlane r5, r8, r2, r5     @ R5-=W1*COLr16[3x8]=b2 (COLr16[3x8] must be the second arg, to have the possibility to save 1 cycle)
 340         it    ne
 341         mlane r7, r10, r2, r7    @ R7-=W5*COLr16[3x8]=b3 (COLr16[3x8] must be the second arg, to have the possibility to save 1 cycle)
 342
 343         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 344         @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 345         @@     R12=__const_ptr_, R14=&block[n]
 346         @@ MAC16(b0, W5, col[5x8]);
 347         @@ MAC16(b2, W7, col[5x8]);
 348         @@ MAC16(b3, W3, col[5x8]);
 349         @@ MAC16(b1, -W1, col[5x8]);
 350         @@ MAC16(b0, W7, col[7x8]);
 351         @@ MAC16(b2, W3, col[7x8]);
 352         @@ MAC16(b3, -W1, col[7x8]);
 353         @@ MAC16(b1, -W5, col[7x8]);
 354         ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
 355         teq r3, #0               @ if 0 then avoid muls
 356         itttt ne
 357         mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
 358         mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
 359         mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
 360         rsbne r3, r3, #0         @ R3=-COLr16[5x8]
 361         ldrsh r4, [r14, #112]    @ R4=COLr16[7x8] (a load does not change the flags, the teq r3 result is still used below)
 362         it    ne
 363         mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
 364         @@ R3 is free now
 365         teq r4, #0               @ if 0 then avoid muls
 366         itttt ne
 367         mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
 368         mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
 369         rsbne r4, r4, #0         @ R4=-COLr16[7x8]
 370         mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
 371         it    ne
 372         mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
 373         @@ R4 is free now
 374 __end_b_evaluation2:
 375         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 376         @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 377         @@     R12=__const_ptr_, R14=&block[n]
 378
 379 __a_evaluation2:
 380         @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
 381         @@ a1 = a0 + W6 * col[8x2];
 382         @@ a2 = a0 - W6 * col[8x2];
 383         @@ a3 = a0 - W2 * col[8x2];
 384         @@ a0 = a0 + W2 * col[8x2];
 385         ldrsh r6, [r14, #0]      @ R6=COLr16[0x8]
 386         ldr r9, [r12, #offW4]    @ R9=W4
 387         mul r6, r9, r6           @ R6=W4*COLr16[0x8]
 388         ldr r10, [r12, #offW6]   @ R10=W6
 389         ldrsh r4, [r14, #32]     @ R4=COLr16[2x8] (a3 not defined yet)
 390         add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[0x8] + 1<<(COL_SHIFT-1) (a0)
 391         mul r11, r10, r4         @ R11=W6*COLr16[2x8]
 392         ldr r8, [r12, #offW2]    @ R8=W2
 393         add r2, r6, r11          @ R2=a0+W6*COLr16[2x8] (a1)
 394         sub r3, r6, r11          @ R3=a0-W6*COLr16[2x8] (a2)
 395         mul r11, r8, r4          @ R11=W2*COLr16[2x8]
 396         sub r4, r6, r11          @ R4=a0-W2*COLr16[2x8] (a3)
 397         add r6, r6, r11          @ R6=a0+W2*COLr16[2x8] (a0)
 398
 399         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 400         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 401         @@     R12=__const_ptr_, R14=&block[n]
 402         @@ a0 += W4*col[8x4]
 403         @@ a1 -= W4*col[8x4]
 404         @@ a2 -= W4*col[8x4]
 405         @@ a3 += W4*col[8x4]
 406         ldrsh r11, [r14, #64]    @ R11=COLr16[4x8]
 407         teq r11, #0              @ if null avoid muls
 408         itttt ne
 409         mulne r11, r9, r11       @ R11=W4*COLr16[4x8]
 410         @@ R9 is free now
 411         addne r6, r6, r11        @ R6+=W4*COLr16[4x8] (a0)
 412         subne r2, r2, r11        @ R2-=W4*COLr16[4x8] (a1)
 413         subne r3, r3, r11        @ R3-=W4*COLr16[4x8] (a2)
 414         ldrsh r9, [r14, #96]     @ R9=COLr16[6x8] (a load does not change the flags, the teq r11 result is still used below)
 415         it    ne
 416         addne r4, r4, r11        @ R4+=W4*COLr16[4x8] (a3)
 417         @@ W6 alone is no longer needed, R10 is reused below to hold W2*COLr16[6x8]
 418         teq r9, #0               @ if null avoid muls
 419         itttt ne
 420         mulne r11, r10, r9       @ R11=W6*COLr16[6x8]
 421         addne r6, r6, r11        @ R6+=W6*COLr16[6x8] (a0)
 422         mulne r10, r8, r9        @ R10=W2*COLr16[6x8]
 423         @@ a0 += W6*col[8x6];
 424         @@ a3 -= W6*col[8x6];
 425         @@ a1 -= W2*col[8x6];
 426         @@ a2 += W2*col[8x6];
 427         subne r4, r4, r11        @ R4-=W6*COLr16[6x8] (a3)
 428         itt   ne
 429         subne r2, r2, r10        @ R2-=W2*COLr16[6x8] (a1)
 430         addne r3, r3, r10        @ R3+=W2*COLr16[6x8] (a2)
 431 __end_a_evaluation2:
 432         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 433         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 434         @@     R12=__const_ptr_, R14=&block[n]
 435         @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
 436         @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
 437         @@ col[16] = ((a2 + b2) >> COL_SHIFT);
 438         @@ col[24] = ((a3 + b3) >> COL_SHIFT);
 439         @@ col[32] = ((a3 - b3) >> COL_SHIFT);
 440         @@ col[40] = ((a2 - b2) >> COL_SHIFT);
 441         @@ col[48] = ((a1 - b1) >> COL_SHIFT);
 442         @@ col[56] = ((a0 - b0) >> COL_SHIFT);
 443         @@@@@ no optimization here @@@@@
 444         add r8, r6, r0           @ R8=a0+b0
 445         add r9, r2, r1           @ R9=a1+b1
 446         mov r8, r8, asr #COL_SHIFT
 447         mov r9, r9, asr #COL_SHIFT
 448         strh r8, [r14, #0]       @ col[0 ], strh stores the low half-word only
 449         strh r9, [r14, #16]      @ col[8 ]
 450         add r8, r3, r5           @ R8=a2+b2
 451         add r9, r4, r7           @ R9=a3+b3
 452         mov r8, r8, asr #COL_SHIFT
 453         mov r9, r9, asr #COL_SHIFT
 454         strh r8, [r14, #32]      @ col[16]
 455         strh r9, [r14, #48]      @ col[24]
 456         sub r8, r4, r7           @ R8=a3-b3
 457         sub r9, r3, r5           @ R9=a2-b2
 458         mov r8, r8, asr #COL_SHIFT
 459         mov r9, r9, asr #COL_SHIFT
 460         strh r8, [r14, #64]      @ col[32]
 461         strh r9, [r14, #80]      @ col[40]
 462         sub r8, r2, r1           @ R8=a1-b1
 463         sub r9, r6, r0           @ R9=a0-b0
 464         mov r8, r8, asr #COL_SHIFT
 465         mov r9, r9, asr #COL_SHIFT
 466         strh r8, [r14, #96]      @ col[48]
 467         strh r9, [r14, #112]     @ col[56]
 468
 469 __end_col_loop:
 470         @@ at this point, R0-R11 (free)
 471         @@     R12=__const_ptr_, R14=&block[n]
 472         ldr r0, [sp, #0]         @ R0=block
 473         teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
 474         sub r14, r14, #2         @ previous column (2 bytes); no S suffix, so the flags of the teq are kept for the bne
 475         bne __col_loop
 476
 477
 478
 479
 480 __end_simple_idct_arm:
 481         @@ restore registers to previous status!
 482         add sp, sp, #8 @@ release the 2 local words reserved at function entry
 483         ldmfd sp!, {r4-r11, r15} @@ restore R4-R11 and return: the saved LR is loaded directly into PC (R15).
 484
 485
 486
 487 @@ kind of sub-function, kept out of line so as not to burden the common case.
 488 __end_bef_a_evaluation:
 489         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 490         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 491         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 492         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 493         bal __end_a_evaluation   @ back to the row loop, at the point where the row results are stored
 494
 495
 496         .align             @@ pad to a word boundary BEFORE the label, so that the label names the first constant and never the padding
 497 __constant_ptr__:  @@ table read through R12 with the off* byte offsets; see #defines at the beginning of the source code for values.
 498         .word   W1         @@ offW1
 499         .word   W2         @@ offW2
 500         .word   W3         @@ offW3
 501         .word   W4         @@ offW4
 502         .word   W5         @@ offW5
 503         .word   W6         @@ offW6
 504         .word   W7         @@ offW7
 505         .word   MASK_MSHW  @@ offMASK_MSHW