- int a0, a1, a2, a3, b0, b1, b2, b3;
-
- if( !( ((uint32_t*)row)[0]|((uint32_t*)row)[1] |((uint32_t*)row)[2] |((uint32_t*)row)[3])) {
-/* row[0] = row[1] = row[2] = row[3] = row[4] =
- row[5] = row[6] = row[7] = 0;*/
- return 0;
- }
-
- if(!( ((uint32_t*)row)[2] |((uint32_t*)row)[3] )){
- a0 = W4*row[0] + W2*row[2] + (1<<(ROW_SHIFT-1));
- a1 = W4*row[0] + W6*row[2] + (1<<(ROW_SHIFT-1));
- a2 = W4*row[0] - W6*row[2] + (1<<(ROW_SHIFT-1));
- a3 = W4*row[0] - W2*row[2] + (1<<(ROW_SHIFT-1));
-
- b0 = W1*row[1] + W3*row[3];
- b1 = W3*row[1] - W7*row[3];
- b2 = W5*row[1] - W1*row[3];
- b3 = W7*row[1] - W5*row[3];
- }else{
- a0 = W4*row[0] + W2*row[2] + W4*row[4] + W6*row[6] + (1<<(ROW_SHIFT-1));
- a1 = W4*row[0] + W6*row[2] - W4*row[4] - W2*row[6] + (1<<(ROW_SHIFT-1));
- a2 = W4*row[0] - W6*row[2] - W4*row[4] + W2*row[6] + (1<<(ROW_SHIFT-1));
- a3 = W4*row[0] - W2*row[2] + W4*row[4] - W6*row[6] + (1<<(ROW_SHIFT-1));
-
- b0 = W1*row[1] + W3*row[3] + W5*row[5] + W7*row[7];
- b1 = W3*row[1] - W7*row[3] - W1*row[5] - W5*row[7];
- b2 = W5*row[1] - W1*row[3] + W7*row[5] + W3*row[7];
- b3 = W7*row[1] - W5*row[3] + W3*row[5] - W1*row[7];
- }
-
- row[0] = (a0 + b0) >> ROW_SHIFT;
- row[1] = (a1 + b1) >> ROW_SHIFT;
- row[2] = (a2 + b2) >> ROW_SHIFT;
- row[3] = (a3 + b3) >> ROW_SHIFT;
- row[4] = (a3 - b3) >> ROW_SHIFT;
- row[5] = (a2 - b2) >> ROW_SHIFT;
- row[6] = (a1 - b1) >> ROW_SHIFT;
- row[7] = (a0 - b0) >> ROW_SHIFT;
-
- return 1;
+ int a0, a1, a2, a3, b0, b1, b2, b3;
+#ifdef HAVE_FAST_64BIT
+ uint64_t temp;
+#else
+ uint32_t temp;
+#endif
+
+#ifdef HAVE_FAST_64BIT
+#ifdef WORDS_BIGENDIAN
+#define ROW0_MASK 0xffff000000000000LL
+#else
+#define ROW0_MASK 0xffffLL
+#endif
+ if(sizeof(DCTELEM)==2){
+ if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) |
+ ((uint64_t *)row)[1]) == 0) {
+ temp = (row[0] << 3) & 0xffff;
+ temp += temp << 16;
+ temp += temp << 32;
+ ((uint64_t *)row)[0] = temp;
+ ((uint64_t *)row)[1] = temp;
+ return;
+ }
+ }else{
+ if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
+ row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3;
+ return;
+ }
+ }
+#else
+ if(sizeof(DCTELEM)==2){
+ if (!(((uint32_t*)row)[1] |
+ ((uint32_t*)row)[2] |
+ ((uint32_t*)row)[3] |
+ row[1])) {
+ temp = (row[0] << 3) & 0xffff;
+ temp += temp << 16;
+ ((uint32_t*)row)[0]=((uint32_t*)row)[1] =
+ ((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp;
+ return;
+ }
+ }else{
+ if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
+ row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3;
+ return;
+ }
+ }
+#endif
+
+ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
+ a1 = a0;
+ a2 = a0;
+ a3 = a0;
+
+ /* No need to hand-optimize the repeated W2/W6 * row[2] products below: gcc does it. */
+ a0 += W2 * row[2];
+ a1 += W6 * row[2];
+ a2 -= W6 * row[2];
+ a3 -= W2 * row[2];
+
+ MUL16(b0, W1, row[1]);
+ MAC16(b0, W3, row[3]);
+ MUL16(b1, W3, row[1]);
+ MAC16(b1, -W7, row[3]);
+ MUL16(b2, W5, row[1]);
+ MAC16(b2, -W1, row[3]);
+ MUL16(b3, W7, row[1]);
+ MAC16(b3, -W5, row[3]);
+
+#ifdef HAVE_FAST_64BIT
+ temp = ((uint64_t*)row)[1];
+#else
+ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
+#endif
+ if (temp != 0) {
+ a0 += W4*row[4] + W6*row[6];
+ a1 += - W4*row[4] - W2*row[6];
+ a2 += - W4*row[4] + W2*row[6];
+ a3 += W4*row[4] - W6*row[6];
+
+ MAC16(b0, W5, row[5]);
+ MAC16(b0, W7, row[7]);
+
+ MAC16(b1, -W1, row[5]);
+ MAC16(b1, -W5, row[7]);
+
+ MAC16(b2, W7, row[5]);
+ MAC16(b2, W3, row[7]);
+
+ MAC16(b3, W3, row[5]);
+ MAC16(b3, -W1, row[7]);
+ }
+
+ row[0] = (a0 + b0) >> ROW_SHIFT;
+ row[7] = (a0 - b0) >> ROW_SHIFT;
+ row[1] = (a1 + b1) >> ROW_SHIFT;
+ row[6] = (a1 - b1) >> ROW_SHIFT;
+ row[2] = (a2 + b2) >> ROW_SHIFT;
+ row[5] = (a2 - b2) >> ROW_SHIFT;
+ row[3] = (a3 + b3) >> ROW_SHIFT;
+ row[4] = (a3 - b3) >> ROW_SHIFT;