- const vector float vec_0_298631336 = (vector float)(FOUR_INSTANCES(0.298631336f));
- const vector float vec_0_390180644 = (vector float)(FOUR_INSTANCES(-0.390180644f));
- const vector float vec_0_541196100 = (vector float)(FOUR_INSTANCES(0.541196100f));
- const vector float vec_0_765366865 = (vector float)(FOUR_INSTANCES(0.765366865f));
- const vector float vec_0_899976223 = (vector float)(FOUR_INSTANCES(-0.899976223f));
- const vector float vec_1_175875602 = (vector float)(FOUR_INSTANCES(1.175875602f));
- const vector float vec_1_501321110 = (vector float)(FOUR_INSTANCES(1.501321110f));
- const vector float vec_1_847759065 = (vector float)(FOUR_INSTANCES(-1.847759065f));
- const vector float vec_1_961570560 = (vector float)(FOUR_INSTANCES(-1.961570560f));
- const vector float vec_2_053119869 = (vector float)(FOUR_INSTANCES(2.053119869f));
- const vector float vec_2_562915447 = (vector float)(FOUR_INSTANCES(-2.562915447f));
- const vector float vec_3_072711026 = (vector float)(FOUR_INSTANCES(3.072711026f));
-
-
- int whichPass, whichHalf;
-
- for(whichPass = 1; whichPass<=2; whichPass++)
- {
- for(whichHalf = 1; whichHalf<=2; whichHalf++)
- {
- vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- vector float tmp10, tmp11, tmp12, tmp13;
- vector float z1, z2, z3, z4, z5;
-
- tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7];
- tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7];
- tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4];
- tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4];
- tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6];
- tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6];
- tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5];
- tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5];
-
- tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3;
- tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3;
- tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2;
- tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2;
-
-
- // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
- row0 = vec_add(tmp10, tmp11);
-
- // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
- row4 = vec_sub(tmp10, tmp11);
-
-
- // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
- z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero);
-
- // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
- // CONST_BITS-PASS1_BITS);
- row2 = vec_madd(tmp13, vec_0_765366865, z1);
-
- // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
- // CONST_BITS-PASS1_BITS);
- row6 = vec_madd(tmp12, vec_1_847759065, z1);
-
- z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7;
- z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6;
- z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6;
- z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7;
-
- // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
- z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero);
-
- // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
- z3 = vec_madd(z3, vec_1_961570560, z5);
-
- // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
- z4 = vec_madd(z4, vec_0_390180644, z5);
-
- // The following adds are rolled into the multiplies above
- // z3 = vec_add(z3, z5); // z3 += z5;
- // z4 = vec_add(z4, z5); // z4 += z5;
-
- // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
- // Wow! It's actually more effecient to roll this multiply
- // into the adds below, even thought the multiply gets done twice!
- // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero);
-
- // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
- // Same with this one...
- // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero);
-
- // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
- // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
- row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3));
-
- // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
- // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
- row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4));
-
- // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
- // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
- row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3));
-
- // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
- // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
- row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4));
-
- // Swap the row values with the alts. If this is the first half,
- // this sets up the low values to be acted on in the second half.
- // If this is the second half, it puts the high values back in
- // the row values where they are expected to be when we're done.
- SWAP(row0, alt0);
- SWAP(row1, alt1);
- SWAP(row2, alt2);
- SWAP(row3, alt3);
- SWAP(row4, alt4);
- SWAP(row5, alt5);
- SWAP(row6, alt6);
- SWAP(row7, alt7);
- }
-
- if (whichPass == 1)
- {
- // transpose the data for the second pass
-
- // First, block transpose the upper right with lower left.
- SWAP(row4, alt0);
- SWAP(row5, alt1);
- SWAP(row6, alt2);
- SWAP(row7, alt3);
-
- // Now, transpose each block of four
- TRANSPOSE4(row0, row1, row2, row3);
- TRANSPOSE4(row4, row5, row6, row7);
- TRANSPOSE4(alt0, alt1, alt2, alt3);
- TRANSPOSE4(alt4, alt5, alt6, alt7);
- }
+ register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
+ DECLARE_ALIGNED(16, short, qmul8) = qmul;
+ DECLARE_ALIGNED(16, short, qadd8) = qadd;
+ register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
+ register vector bool short blockv_null, blockv_neg;
+ register short backup_0 = block[0];
+ register int j = 0;
+
+ qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
+ qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
+ nqaddv = vec_sub(vczero, qaddv);
+
+ // vectorize all the 16-byte-aligned groups
+ // of 8 coefficients
+ for(; (j + 7) <= nCoeffs ; j+=8) {
+ blockv = vec_ld(j << 1, block);
+ blockv_neg = vec_cmplt(blockv, vczero);
+ blockv_null = vec_cmpeq(blockv, vczero);
+ // choose between +qadd or -qadd as the third operand
+ temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
+ // multiply & add (block{i,i+7} * qmul [+-] qadd)
+ temp1 = vec_mladd(blockv, qmulv, temp1);
+ // keep 0 in every lane where block[i..i+7] was originally 0
+ blockv = vec_sel(temp1, blockv, blockv_null);
+ vec_st(blockv, j << 1, block);