git.sesse.net Git - ffmpeg/blob - libavcodec/arm/simple_idct_arm.S

   1 /*
   2  * Copyright (C) 2002 Frederic 'dilb' Boulay
   3  *
   4  * Author: Frederic Boulay <dilb@handhelds.org>
   5  *
   6  * The function defined in this file is derived from the simple_idct function
   7  * from the libavcodec library part of the FFmpeg project.
   8  *
   9  * This file is part of FFmpeg.
  10  *
  11  * FFmpeg is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License as published by the Free Software Foundation; either
  14  * version 2.1 of the License, or (at your option) any later version.
  15  *
  16  * FFmpeg is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with FFmpeg; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24  */
  25
  26 #include "libavutil/arm/asm.S"
  27
  28 /* fixed-point cosine constants for the IDCT: Wk = round(sqrt(2) * cos(k*pi/16) * 2^14); W4 is stored as 16383 although the formula gives 16384 (presumably intentional, TODO confirm) */
  29 #define W1  22725
  30 #define W2  21407
  31 #define W3  19266
  32 #define W4  16383
  33 #define W5  12873
  34 #define W6  8867
  35 #define W7  4520
  36 #define MASK_MSHW 0xFFFF0000 /* selects the most significant half-word of a 32-bit word */
  37
  38 #define ROW_SHIFT 11 /* arithmetic right shift applied to the row-pass results */
  39 #define ROW_SHIFT2MSHW (16-11) /* 16 - ROW_SHIFT: left shift that puts (x >> ROW_SHIFT) into the upper half-word */
  40 #define COL_SHIFT 20 /* arithmetic right shift applied to the column-pass results */
  41 #define ROW_SHIFTED_1 1024 /* 1 << (ROW_SHIFT-1): rounding bias added to a0 in the row pass */
  42 #define COL_SHIFTED_1 524288 /* 1 << (COL_SHIFT-1): rounding bias added to a0 in the column pass */
  43
  44
  45 function ff_simple_idct_arm, export=1
  46         @@ void ff_simple_idct_arm(int16_t *block): in-place 8x8 inverse DCT, row pass (__row_loop) followed by column pass (__col_loop)
  47         @@ the body needs every register from r4 to r11 plus r14, so all of them are saved on the stack,
  48         @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block on entry
  49         @@ and is later reused for b0, so block is kept in sp[0] and reloaded at the end of each loop iteration.
  50         @@ R12 is another scratch register, so it is not saved either; no instruction below actually uses it
  51         @@ save r4-r11 and the return address
  52         stmfd sp!, {r4-r11, r14} @ R14 is also called LR
  53         @@ at this point, R0=block, other registers are free.
  54         add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R14=block.
  55         @@ reserve 2 words of temporaries on the stack; only sp[0] (block) is ever used
  56         sub sp, sp, #8          @ allow 2 local variables
  57         str r0, [sp, #0]        @ save block in sp[0]
  58         @@ stack status
  59         @@ sp+4   free
  60         @@ sp+0   R0  (block)
  61
  62
  63         @@ at this point, R0=block, R14=&block[56], R12 (unused), R1-R11 free
  64
  65
  66 __row_loop:
  67         @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32 bits in two 16-bit words), at least it gives more usable registers :)
  68         ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
  69         ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
  70         ldr r3, [r14, #8]        @ R3=ROWr32[2]
  71         ldr r4, [r14, #12]       @ R4=ROWr32[3]
  72         @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
  73         @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
  74         @@ else follow the complete algorithm.
  75         @@ at this point, R0=block, R14=&block[n], R12 (unused), R1=ROWr32[0], R2=ROWr32[1],
  76         @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
  77         orr r5, r4, r3           @ R5=R4 | R3
  78         orr r5, r5, r2           @ R5=R4 | R3 | R2
  79         orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
  80         beq __end_row_loop       @ all-zero row: nothing is written back
  81         mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
  82         ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
  83         orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7 (zero iff ROWr16[1..7] are all zero)
  84         beq __almost_empty_row
  85
  86 @@ __b_evaluation:
  87         @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
  88         @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
  89         @@     R12 (unused), R14=&block[n]
  90         @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
  91
  92         @@ MUL16(b0, W1, row[1]);
  93         @@ MUL16(b1, W3, row[1]);
  94         @@ MUL16(b2, W5, row[1]);
  95         @@ MUL16(b3, W7, row[1]);
  96         @@ MAC16(b0, W3, row[3]);
  97         @@ MAC16(b1, -W7, row[3]);
  98         @@ MAC16(b2, -W1, row[3]);
  99         @@ MAC16(b3, -W5, row[3]);
 100         ldr r8, =W1              @ R8=W1
 101         mov r2, r2, asr #16      @ R2=ROWr16[3]
 102         mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 103         ldr r9, =W3              @ R9=W3
 104         ldr r10, =W5             @ R10=W5
 105         mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 106         ldr r11, =W7             @ R11=W7
 107         mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 108         mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 109         teq r2, #0               @ if null avoid muls
 110         itttt ne                 @ Thumb-2 IT block: at most 4 conditional instructions, hence the separate 'it ne' before the 5th
 111         mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 112         rsbne r2, r2, #0         @ R2=-ROWr16[3]
 113         mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 114         mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 115         it    ne
 116         mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 117
 118         @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 119         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 120         @@     R12 (unused), R14=&block[n]
 121         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 122         @@ if (temp != 0) {}
 123         orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
 124         beq __end_b_evaluation   @ ROWr16[4..7] are all zero: b0-b3 are already complete
 125
 126         @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 127         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 128         @@     R12 (unused), R14=&block[n]
 129         @@ MAC16(b0, W5, row[5]);
 130         @@ MAC16(b2, W7, row[5]);
 131         @@ MAC16(b3, W3, row[5]);
 132         @@ MAC16(b1, -W1, row[5]);
 133         @@ MAC16(b0, W7, row[7]);
 134         @@ MAC16(b2, W3, row[7]);
 135         @@ MAC16(b3, -W1, row[7]);
 136         @@ MAC16(b1, -W5, row[7]);
 137         mov r3, r3, asr #16      @ R3=ROWr16[5]
 138         teq r3, #0               @ if null avoid muls
 139         it    ne
 140         mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
 141         mov r4, r4, asr #16      @ R4=ROWr16[7] (mov without 's' keeps the flags from 'teq r3' for the block below)
 142         itttt ne
 143         mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
 144         mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
 145         rsbne r3, r3, #0         @ R3=-ROWr16[5]
 146         mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
 147         @@ R3 is free now
 148         teq r4, #0               @ if null avoid muls
 149         itttt ne
 150         mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
 151         mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
 152         rsbne r4, r4, #0         @ R4=-ROWr16[7]
 153         mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
 154         it    ne
 155         mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
 156         @@ R4 is free now
 157 __end_b_evaluation:
 158         @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
 159         @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 160         @@     R12 (unused), R14=&block[n]
 161
 162 @@ __a_evaluation:
 163         @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
 164         @@ a1 = a0 + W6 * row[2];
 165         @@ a2 = a0 - W6 * row[2];
 166         @@ a3 = a0 - W2 * row[2];
 167         @@ a0 = a0 + W2 * row[2];
 168         ldr r9, =W4              @ R9=W4
 169         mul r6, r9, r6           @ R6=W4*ROWr16[0]
 170         ldr r10, =W6             @ R10=W6
 171         ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
 172         add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
 173
 174         mul r11, r10, r4         @ R11=W6*ROWr16[2]
 175         ldr r8, =W2              @ R8=W2
 176         sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
 177         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 178         @@ if (temp != 0) {}
 179         teq r2, #0               @ R2 still holds ROWr32[2] | ROWr32[3] from the b evaluation
 180         beq __end_bef_a_evaluation @ ROWr16[4..7] all zero: finish a0-a3 out of line, skipping the row[4]/row[6] terms
 181
 182         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 183         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 184         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 185         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 186
 187
 188         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 189         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 190         @@     R12 (unused), R14=&block[n]
 191
 192
 193         @@ a0 += W4*row[4]
 194         @@ a1 -= W4*row[4]
 195         @@ a2 -= W4*row[4]
 196         @@ a3 += W4*row[4]
 197         ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
 198         teq r11, #0              @ if null avoid muls
 199         it    ne
 200         mulne r11, r9, r11       @ R11=W4*ROWr16[4]
 201         @@ R9 is free now
 202         ldrsh r9, [r14, #12]     @ R9=ROWr16[6] (the load keeps the flags from 'teq r11' for the block below)
 203         itttt ne
 204         addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
 205         subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
 206         subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
 207         addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
 208         @@ W6 (R10) is only needed for one more product, after which R10 is reused to hold W2*ROWr16[6]
 209         teq r9, #0               @ if null avoid muls
 210         itttt ne
 211         mulne r11, r10, r9       @ R11=W6*ROWr16[6]
 212         addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
 213         mulne r10, r8, r9        @ R10=W2*ROWr16[6]
 214         @@ a0 += W6*row[6];
 215         @@ a3 -= W6*row[6];
 216         @@ a1 -= W2*row[6];
 217         @@ a2 += W2*row[6];
 218         subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
 219         itt   ne
 220         subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
 221         addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
 222
 223 __end_a_evaluation:
 224         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 225         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 226         @@     R12 (unused), R14=&block[n]
 227         @@ row[0] = (a0 + b0) >> ROW_SHIFT;
 228         @@ row[1] = (a1 + b1) >> ROW_SHIFT;
 229         @@ row[2] = (a2 + b2) >> ROW_SHIFT;
 230         @@ row[3] = (a3 + b3) >> ROW_SHIFT;
 231         @@ row[4] = (a3 - b3) >> ROW_SHIFT;
 232         @@ row[5] = (a2 - b2) >> ROW_SHIFT;
 233         @@ row[6] = (a1 - b1) >> ROW_SHIFT;
 234         @@ row[7] = (a0 - b0) >> ROW_SHIFT;
 235         add r8, r6, r0           @ R8=a0+b0
 236         add r9, r2, r1           @ R9=a1+b1
 237         @@ put two 16-bit half-words in a 32-bit word
 238         @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only little-endian compliant then!!!)
 239         ldr r10, =MASK_MSHW      @ R10=0xFFFF0000
 240         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
 241         mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
 242         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
 243         orr r8, r8, r9
 244         str r8, [r14, #0]
 245
 246         add r8, r3, r5           @ R8=a2+b2
 247         add r9, r4, r7           @ R9=a3+b3
 248         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
 249         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
 250         orr r8, r8, r9
 251         str r8, [r14, #4]
 252
 253         sub r8, r4, r7           @ R8=a3-b3
 254         sub r9, r3, r5           @ R9=a2-b2
 255         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
 256         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
 257         orr r8, r8, r9
 258         str r8, [r14, #8]
 259
 260         sub r8, r2, r1           @ R8=a1-b1
 261         sub r9, r6, r0           @ R9=a0-b0
 262         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
 263         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
 264         orr r8, r8, r9
 265         str r8, [r14, #12]
 266
 267         bal __end_row_loop
 268
 269 __almost_empty_row:
 270         @@ only ROWr16[0] is non-zero: every output of the row becomes (ROWr16[0]<<3) & 0xFFFF
 271         @@ at this point, R0=block, R14=&block[n], R12 (unused), R1=ROWr32[0], R2=ROWr32[1],
 272         @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
 273         @@                R8-R11 free (R8 is about to receive the 0xFFFF mask)
 274         mov r8, #0x10000         @ R8=0x10000, first of the 2 steps that build 0xFFFF; it saves a ldr call (because of delay run).
 275         sub r8, r8, #1           @ R8=0xFFFF, R8 is now ready.
 276         and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
 277         orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16)
 278         str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
 279         str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
 280         str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
 281         str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
 282
 283 __end_row_loop:
 284         @@ at this point, R0-R11 (free)
 285         @@     R12 (unused), R14=&block[n]
 286         ldr r0, [sp, #0]         @ R0=block
 287         teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
 288         sub r14, r14, #16        @ step back one row (8 x int16); sub without 's' keeps the flags from teq
 289         bne __row_loop
 290
 291
 292
 293         @@ at this point, R0=block, R1-R11 (free)
 294         @@     R12 (unused), R14=&block[n]
 295         add r14, r0, #14        @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
 296 __col_loop:
 297
 298 @@ __b_evaluation2:
 299         @@ at this point, R0=block (temp),  R1-R11 (free)
 300         @@     R12 (unused), R14=&block[n]
 301         @@ proceed with b0-b3 first, followed by a0-a3
 302         @@ MUL16(b0, W1, col[8x1]);
 303         @@ MUL16(b1, W3, col[8x1]);
 304         @@ MUL16(b2, W5, col[8x1]);
 305         @@ MUL16(b3, W7, col[8x1]);
 306         @@ MAC16(b0, W3, col[8x3]);
 307         @@ MAC16(b1, -W7, col[8x3]);
 308         @@ MAC16(b2, -W1, col[8x3]);
 309         @@ MAC16(b3, -W5, col[8x3]);
 310         ldr r8, =W1              @ R8=W1
 311         ldrsh r7, [r14, #16]     @ R7=COLr16[8x1]
 312         mul r0, r8, r7           @ R0=W1*COLr16[8x1]=b0 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 313         ldr r9, =W3              @ R9=W3
 314         ldr r10, =W5             @ R10=W5
 315         mul r1, r9, r7           @ R1=W3*COLr16[8x1]=b1 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 316         ldr r11, =W7             @ R11=W7
 317         mul r5, r10, r7          @ R5=W5*COLr16[8x1]=b2 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 318         ldrsh r2, [r14, #48]     @ R2=COLr16[8x3]
 319         mul r7, r11, r7          @ R7=W7*COLr16[8x1]=b3 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 320         teq r2, #0               @ if 0, then avoid muls
 321         itttt ne
 322         mlane r0, r9, r2, r0     @ R0+=W3*COLr16[8x3]=b0 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 323         rsbne r2, r2, #0         @ R2=-COLr16[8x3]
 324         mlane r1, r11, r2, r1    @ R1-=W7*COLr16[8x3]=b1 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 325         mlane r5, r8, r2, r5     @ R5-=W1*COLr16[8x3]=b2 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 326         it    ne
 327         mlane r7, r10, r2, r7    @ R7-=W5*COLr16[8x3]=b3 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 328
 329         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 330         @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 331         @@     R12 (unused), R14=&block[n]
 332         @@ MAC16(b0, W5, col[5x8]);
 333         @@ MAC16(b2, W7, col[5x8]);
 334         @@ MAC16(b3, W3, col[5x8]);
 335         @@ MAC16(b1, -W1, col[5x8]);
 336         @@ MAC16(b0, W7, col[7x8]);
 337         @@ MAC16(b2, W3, col[7x8]);
 338         @@ MAC16(b3, -W1, col[7x8]);
 339         @@ MAC16(b1, -W5, col[7x8]);
 340         ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
 341         teq r3, #0               @ if 0 then avoid muls
 342         itttt ne
 343         mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
 344         mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
 345         mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
 346         rsbne r3, r3, #0         @ R3=-COLr16[5x8]
 347         ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
 348         it    ne
 349         mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
 350         @@ R3 is free now
 351         teq r4, #0               @ if 0 then avoid muls
 352         itttt ne
 353         mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
 354         mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
 355         rsbne r4, r4, #0         @ R4=-COLr16[7x8]
 356         mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
 357         it    ne
 358         mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
 359         @@ R4 is free now
 360 @@ __end_b_evaluation2:
 361         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 362         @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 363         @@     R12 (unused), R14=&block[n]
 364
 365 @@ __a_evaluation2:
 366         @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
 367         @@ a1 = a0 + W6 * col[8x2];
 368         @@ a2 = a0 - W6 * col[8x2];
 369         @@ a3 = a0 - W2 * col[8x2];
 370         @@ a0 = a0 + W2 * col[8x2];
 371         ldrsh r6, [r14, #0]      @ R6=COLr16[8x0]
 372         ldr r9, =W4              @ R9=W4
 373         mul r6, r9, r6           @ R6=W4*COLr16[8x0]
 374         ldr r10, =W6             @ R10=W6
 375         ldrsh r4, [r14, #32]     @ R4=COLr16[8x2] (a3 not defined yet)
 376         add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + 1<<(COL_SHIFT-1) (a0)
 377         mul r11, r10, r4         @ R11=W6*COLr16[8x2]
 378         ldr r8, =W2              @ R8=W2
 379         add r2, r6, r11          @ R2=a0+W6*COLr16[8x2] (a1)
 380         sub r3, r6, r11          @ R3=a0-W6*COLr16[8x2] (a2)
 381         mul r11, r8, r4          @ R11=W2*COLr16[8x2]
 382         sub r4, r6, r11          @ R4=a0-W2*COLr16[8x2] (a3)
 383         add r6, r6, r11          @ R6=a0+W2*COLr16[8x2] (a0)
 384
 385         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 386         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 387         @@     R12 (unused), R14=&block[n]
 388         @@ a0 += W4*col[8x4]
 389         @@ a1 -= W4*col[8x4]
 390         @@ a2 -= W4*col[8x4]
 391         @@ a3 += W4*col[8x4]
 392         ldrsh r11, [r14, #64]    @ R11=COLr16[8x4]
 393         teq r11, #0              @ if null avoid muls
 394         itttt ne
 395         mulne r11, r9, r11       @ R11=W4*COLr16[8x4]
 396         @@ R9 is free now
 397         addne r6, r6, r11        @ R6+=W4*COLr16[8x4] (a0)
 398         subne r2, r2, r11        @ R2-=W4*COLr16[8x4] (a1)
 399         subne r3, r3, r11        @ R3-=W4*COLr16[8x4] (a2)
 400         ldrsh r9, [r14, #96]     @ R9=COLr16[8x6] (the load keeps the flags from 'teq r11' for the addne below)
 401         it    ne
 402         addne r4, r4, r11        @ R4+=W4*COLr16[8x4] (a3)
 403         @@ W6 (R10) is only needed for one more product, after which R10 is reused to hold W2*COLr16[8x6]
 404         teq r9, #0               @ if null avoid muls
 405         itttt ne
 406         mulne r11, r10, r9       @ R11=W6*COLr16[8x6]
 407         addne r6, r6, r11        @ R6+=W6*COLr16[8x6] (a0)
 408         mulne r10, r8, r9        @ R10=W2*COLr16[8x6]
 409         @@ a0 += W6*col[8x6];
 410         @@ a3 -= W6*col[8x6];
 411         @@ a1 -= W2*col[8x6];
 412         @@ a2 += W2*col[8x6];
 413         subne r4, r4, r11        @ R4-=W6*COLr16[8x6] (a3)
 414         itt   ne
 415         subne r2, r2, r10        @ R2-=W2*COLr16[8x6] (a1)
 416         addne r3, r3, r10        @ R3+=W2*COLr16[8x6] (a2)
 417 @@ __end_a_evaluation2:
 418         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 419         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 420         @@     R12 (unused), R14=&block[n]
 421         @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
 422         @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
 423         @@ col[16] = ((a2 + b2) >> COL_SHIFT);
 424         @@ col[24] = ((a3 + b3) >> COL_SHIFT);
 425         @@ col[32] = ((a3 - b3) >> COL_SHIFT);
 426         @@ col[40] = ((a2 - b2) >> COL_SHIFT);
 427         @@ col[48] = ((a1 - b1) >> COL_SHIFT);
 428         @@ col[56] = ((a0 - b0) >> COL_SHIFT);
 429         @@@@@ no optimization here @@@@@
 430         add r8, r6, r0           @ R8=a0+b0
 431         add r9, r2, r1           @ R9=a1+b1
 432         mov r8, r8, asr #COL_SHIFT
 433         mov r9, r9, asr #COL_SHIFT
 434         strh r8, [r14, #0]       @ col[0]; strh stores only the low 16 bits of the register
 435         strh r9, [r14, #16]      @ col[8]
 436         add r8, r3, r5           @ R8=a2+b2
 437         add r9, r4, r7           @ R9=a3+b3
 438         mov r8, r8, asr #COL_SHIFT
 439         mov r9, r9, asr #COL_SHIFT
 440         strh r8, [r14, #32]      @ col[16]
 441         strh r9, [r14, #48]      @ col[24]
 442         sub r8, r4, r7           @ R8=a3-b3
 443         sub r9, r3, r5           @ R9=a2-b2
 444         mov r8, r8, asr #COL_SHIFT
 445         mov r9, r9, asr #COL_SHIFT
 446         strh r8, [r14, #64]      @ col[32]
 447         strh r9, [r14, #80]      @ col[40]
 448         sub r8, r2, r1           @ R8=a1-b1
 449         sub r9, r6, r0           @ R9=a0-b0
 450         mov r8, r8, asr #COL_SHIFT
 451         mov r9, r9, asr #COL_SHIFT
 452         strh r8, [r14, #96]      @ col[48]
 453         strh r9, [r14, #112]     @ col[56]
 454
 455 @@ __end_col_loop:
 456         @@ at this point, R0-R11 (free)
 457         @@     R12 (unused), R14=&block[n]
 458         ldr r0, [sp, #0]         @ R0=block
 459         teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
 460         sub r14, r14, #2         @ step back one column (one int16); sub without 's' keeps the flags from teq
 461         bne __col_loop
 462
 463
 464
 465
 466 @@ __end_simple_idct_arm:
 467         @@ restore registers to previous status!
 468         add sp, sp, #8 @@ drop the 2 local words reserved at entry
 469         ldmfd sp!, {r4-r11, r15} @@ restore r4-r11 and return by loading the saved LR into PC.
 470
 471
 472
 473 @@ out-of-line tail of the row-pass a evaluation, used when ROWr16[4..7] are all zero; kept after the return so the common case falls straight through.
 474 __end_bef_a_evaluation:
 475         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 476         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 477         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 478         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 479         bal __end_a_evaluation
 480 endfunc