X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fppc%2Fmpegvideo_altivec.c;h=8348e684bd7cd7041bc024f9cdb5bdae0e6c2096;hb=f486321395e3804ceece2a562f4bf1a8d99d5f24;hp=1e05acc2e5127525ebfcf9585a3ffbc694628f6e;hpb=9d16f87ffddff90182d679531a3c3b09f1243471;p=ffmpeg

diff --git a/libavcodec/ppc/mpegvideo_altivec.c b/libavcodec/ppc/mpegvideo_altivec.c
index 1e05acc2e51..8348e684bd7 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -23,12 +23,12 @@
 #include <stdlib.h>
 #include <stdio.h>
 
-#include "dsputil.h"
-#include "mpegvideo.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/mpegvideo.h"
 
-#include "gcc_fixes.h"
-
-#include "dsputil_altivec.h"
+#include "dsputil_ppc.h"
+#include "util_altivec.h"
+#include "types_altivec.h"
 
 // Swaps two variables (used for altivec registers)
 #define SWAP(a,b) \
@@ -41,15 +41,15 @@ do { \
 // transposes a matrix consisting of four vectors with four elements each
 #define TRANSPOSE4(a,b,c,d) \
 do { \
-  __typeof__(a) _trans_ach = vec_mergeh(a, c); \
-  __typeof__(a) _trans_acl = vec_mergel(a, c); \
-  __typeof__(a) _trans_bdh = vec_mergeh(b, d); \
-  __typeof__(a) _trans_bdl = vec_mergel(b, d); \
- \
-  a = vec_mergeh(_trans_ach, _trans_bdh); \
-  b = vec_mergel(_trans_ach, _trans_bdh); \
-  c = vec_mergeh(_trans_acl, _trans_bdl); \
-  d = vec_mergel(_trans_acl, _trans_bdl); \
+    __typeof__(a) _trans_ach = vec_mergeh(a, c); \
+    __typeof__(a) _trans_acl = vec_mergel(a, c); \
+    __typeof__(a) _trans_bdh = vec_mergeh(b, d); \
+    __typeof__(a) _trans_bdl = vec_mergel(b, d); \
+ \
+    a = vec_mergeh(_trans_ach, _trans_bdh); \
+    b = vec_mergel(_trans_ach, _trans_bdh); \
+    c = vec_mergeh(_trans_acl, _trans_bdl); \
+    d = vec_mergel(_trans_acl, _trans_bdl); \
 } while (0)
 
 
@@ -58,29 +58,25 @@ do { \
 // target address is four-byte aligned (which should be always).
 #define LOAD4(vec, address) \
 { \
-  __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \
-  vector unsigned char _perm_vec = vec_lvsl(0,(address)); \
-  vec = vec_ld(0, _load_addr); \
-  vec = vec_perm(vec, vec, _perm_vec); \
-  vec = vec_splat(vec, 0); \
+    __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \
+    vector unsigned char _perm_vec = vec_lvsl(0,(address)); \
+    vec = vec_ld(0, _load_addr); \
+    vec = vec_perm(vec, vec, _perm_vec); \
+    vec = vec_splat(vec, 0); \
 }
 
-#ifdef SYS_DARWIN
-#define FOUROF(a) (a)
-#else
-// slower, for dumb non-apple GCC
 #define FOUROF(a) {a,a,a,a}
-#endif
+
 
 int dct_quantize_altivec(MpegEncContext* s,
-                        DCTELEM* data, int n,
-                        int qscale, int* overflow)
+                         DCTELEM* data, int n,
+                         int qscale, int* overflow)
 {
     int lastNonZero;
     vector float row0, row1, row2, row3, row4, row5, row6, row7;
     vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;
-    const_vector float zero = (const_vector float)FOUROF(0.);
-    // used after quantise step
+    const vector float zero = (const vector float)FOUROF(0.);
+    // used after quantize step
     int oldBaseValue = 0;
 
     // Load the data into the row/alt vectors
@@ -141,10 +137,8 @@ int dct_quantize_altivec(MpegEncContext* s,
 
     int whichPass, whichHalf;
 
-    for(whichPass = 1; whichPass<=2; whichPass++)
-    {
-        for(whichHalf = 1; whichHalf<=2; whichHalf++)
-        {
+    for(whichPass = 1; whichPass<=2; whichPass++) {
+        for(whichHalf = 1; whichHalf<=2; whichHalf++) {
             vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
             vector float tmp10, tmp11, tmp12, tmp13;
             vector float z1, z2, z3, z4, z5;
@@ -201,7 +195,7 @@ int dct_quantize_altivec(MpegEncContext* s,
             // z4 = vec_add(z4, z5);        // z4 += z5;
             // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
-            // Wow! It's actually more effecient to roll this multiply
+            // Wow! It's actually more efficient to roll this multiply
             // into the adds below, even thought the multiply gets done twice!
             // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero);
@@ -239,8 +233,7 @@ int dct_quantize_altivec(MpegEncContext* s,
             SWAP(row7, alt7);
         }
 
-        if (whichPass == 1)
-        {
+        if (whichPass == 1) {
            // transpose the data for the second pass
 
            // First, block transpose the upper right with lower left.
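
A note on the merge-based transpose this pass is about to perform: vec_mergeh(x, y) interleaves the upper halves of two vectors (x0, y0, x1, y1) and vec_mergel the lower halves, so two rounds of merges realize a full transpose, as in the TRANSPOSE4 macro above. A minimal scalar model of the same merge network, in portable C (merge4/transpose4 are illustrative names, not part of this patch):

    #include <stdio.h>

    /* scalar model of vec_mergeh / vec_mergel on 4-element "vectors" */
    static void merge4(const float x[4], const float y[4],
                       float hi[4], float lo[4])
    {
        hi[0] = x[0]; hi[1] = y[0]; hi[2] = x[1]; hi[3] = y[1]; /* mergeh */
        lo[0] = x[2]; lo[1] = y[2]; lo[2] = x[3]; lo[3] = y[3]; /* mergel */
    }

    /* two merge rounds = one 4x4 transpose, mirroring TRANSPOSE4(a,b,c,d) */
    static void transpose4(float a[4], float b[4], float c[4], float d[4])
    {
        float ach[4], acl[4], bdh[4], bdl[4];
        merge4(a, c, ach, acl);
        merge4(b, d, bdh, bdl);
        merge4(ach, bdh, a, b);
        merge4(acl, bdl, c, d);
    }

    int main(void)
    {
        float m[4][4] = { { 0,  1,  2,  3}, { 4,  5,  6,  7},
                          { 8,  9, 10, 11}, {12, 13, 14, 15} };
        transpose4(m[0], m[1], m[2], m[3]);
        for (int r = 0; r < 4; r++)   /* rows print as the original columns */
            printf("%2.0f %2.0f %2.0f %2.0f\n",
                   m[r][0], m[r][1], m[r][2], m[r][3]);
        return 0;
    }

The TRANSPOSE8 applied to data0..data7 further down follows the same pattern, extended to eight vectors of 16-bit elements.
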
@@ -258,15 +251,14 @@ int dct_quantize_altivec(MpegEncContext* s,
         }
     }
 
-    // perform the quantise step, using the floating point data
+    // perform the quantize step, using the floating point data
     // still in the row/alt registers
     {
         const int* biasAddr;
         const vector signed int* qmat;
         vector float bias, negBias;
 
-        if (s->mb_intra)
-        {
+        if (s->mb_intra) {
             vector signed int baseVector;
 
             // We must cache element 0 in the intra case
@@ -276,9 +268,7 @@ int dct_quantize_altivec(MpegEncContext* s,
 
             qmat = (vector signed int*)s->q_intra_matrix[qscale];
             biasAddr = &(s->intra_quant_bias);
-        }
-        else
-        {
+        } else {
             qmat = (vector signed int*)s->q_inter_matrix[qscale];
             biasAddr = &(s->inter_quant_bias);
         }
@@ -387,7 +377,7 @@ int dct_quantize_altivec(MpegEncContext* s,
     {
         vector bool char zero_01, zero_23, zero_45, zero_67;
-        vector signed char scanIndices_01, scanIndices_23, scanIndices_45, scanIndices_67;
+        vector signed char scanIndexes_01, scanIndexes_23, scanIndexes_45, scanIndexes_67;
         vector signed char negOne = vec_splat_s8(-1);
         vector signed char* scanPtr =
                 (vector signed char*)(s->intra_scantable.inverse);
@@ -404,38 +394,38 @@ int dct_quantize_altivec(MpegEncContext* s,
                 vec_cmpeq(data7, (vector signed short)zero));
 
         // 64 biggest values
-        scanIndices_01 = vec_sel(scanPtr[0], negOne, zero_01);
-        scanIndices_23 = vec_sel(scanPtr[1], negOne, zero_23);
-        scanIndices_45 = vec_sel(scanPtr[2], negOne, zero_45);
-        scanIndices_67 = vec_sel(scanPtr[3], negOne, zero_67);
+        scanIndexes_01 = vec_sel(scanPtr[0], negOne, zero_01);
+        scanIndexes_23 = vec_sel(scanPtr[1], negOne, zero_23);
+        scanIndexes_45 = vec_sel(scanPtr[2], negOne, zero_45);
+        scanIndexes_67 = vec_sel(scanPtr[3], negOne, zero_67);
 
         // 32 largest values
-        scanIndices_01 = vec_max(scanIndices_01, scanIndices_23);
-        scanIndices_45 = vec_max(scanIndices_45, scanIndices_67);
+        scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_23);
+        scanIndexes_45 = vec_max(scanIndexes_45, scanIndexes_67);
 
         // 16 largest values
-        scanIndices_01 = vec_max(scanIndices_01, scanIndices_45);
+        scanIndexes_01 = vec_max(scanIndexes_01, scanIndexes_45);
 
         // 8 largest values
-        scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
-                                 vec_mergel(scanIndices_01, negOne));
+        scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
+                                 vec_mergel(scanIndexes_01, negOne));
 
         // 4 largest values
-        scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
-                                 vec_mergel(scanIndices_01, negOne));
+        scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
+                                 vec_mergel(scanIndexes_01, negOne));
 
         // 2 largest values
-        scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
-                                 vec_mergel(scanIndices_01, negOne));
+        scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
+                                 vec_mergel(scanIndexes_01, negOne));
 
         // largest value
-        scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
-                                 vec_mergel(scanIndices_01, negOne));
+        scanIndexes_01 = vec_max(vec_mergeh(scanIndexes_01, negOne),
+                                 vec_mergel(scanIndexes_01, negOne));
 
-        scanIndices_01 = vec_splat(scanIndices_01, 0);
+        scanIndexes_01 = vec_splat(scanIndexes_01, 0);
 
-        vec_ste(scanIndices_01, 0, &lastNonZeroChar);
+        vec_ste(scanIndexes_01, 0, &lastNonZeroChar);
 
         lastNonZero = lastNonZeroChar;
 
@@ -443,8 +433,7 @@ int dct_quantize_altivec(MpegEncContext* s,
     // and handle it using the vector unit if we can. This is the permute used
     // by the altivec idct, so it is common when using the altivec dct.
-    if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM))
-    {
+    if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) {
         TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
     }
 
@@ -460,10 +449,8 @@ int dct_quantize_altivec(MpegEncContext* s,
     }
 
     // special handling of block[0]
-    if (s->mb_intra)
-    {
-        if (!s->h263_aic)
-        {
+    if (s->mb_intra) {
+        if (!s->h263_aic) {
             if (n < 4)
                 oldBaseValue /= s->y_dc_scale;
             else
@@ -474,24 +461,20 @@ int dct_quantize_altivec(MpegEncContext* s,
         data[0] = (oldBaseValue + 4) >> 3;
     }
 
-    // We handled the tranpose permutation above and we don't
+    // We handled the transpose permutation above and we don't
     // need to permute the "no" permutation case.
     if ((lastNonZero > 0) &&
         (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
-        (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM))
-    {
+        (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) {
         ff_block_permute(data, s->dsp.idct_permutation,
                 s->intra_scantable.scantable, lastNonZero);
     }
 
     return lastNonZero;
 }
-#undef FOUROF
 
-/*
-  AltiVec version of dct_unquantize_h263
-  this code assumes `block' is 16 bytes-aligned
-*/
+/* AltiVec version of dct_unquantize_h263
+   this code assumes `block' is 16 bytes-aligned */
 void dct_unquantize_h263_altivec(MpegEncContext *s,
                                  DCTELEM *block, int n, int qscale)
 {
@@ -522,82 +505,110 @@ POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);
     }
 
     {
-        register const_vector signed short vczero = (const_vector signed short)vec_splat_s16(0);
-        DECLARE_ALIGNED_16(short, qmul8[]) =
-            {
-              qmul, qmul, qmul, qmul,
-              qmul, qmul, qmul, qmul
-            };
-        DECLARE_ALIGNED_16(short, qadd8[]) =
-            {
-              qadd, qadd, qadd, qadd,
-              qadd, qadd, qadd, qadd
-            };
-        DECLARE_ALIGNED_16(short, nqadd8[]) =
-            {
-              -qadd, -qadd, -qadd, -qadd,
-              -qadd, -qadd, -qadd, -qadd
-            };
-        register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
-        register vector bool short blockv_null, blockv_neg;
-        register short backup_0 = block[0];
-        register int j = 0;
-
-        qmulv = vec_ld(0, qmul8);
-        qaddv = vec_ld(0, qadd8);
-        nqaddv = vec_ld(0, nqadd8);
-
-#if 0 // block *is* 16 bytes-aligned, it seems.
-        // first make sure block[j] is 16 bytes-aligned
-        for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
-            level = block[j];
-            if (level) {
-                if (level < 0) {
-                    level = level * qmul - qadd;
-                } else {
-                    level = level * qmul + qadd;
+        register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
+        DECLARE_ALIGNED_16(short, qmul8) = qmul;
+        DECLARE_ALIGNED_16(short, qadd8) = qadd;
+        register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
+        register vector bool short blockv_null, blockv_neg;
+        register short backup_0 = block[0];
+        register int j = 0;
+
+        qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
+        qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
+        nqaddv = vec_sub(vczero, qaddv);
+
+#if 0 // block *is* 16 bytes-aligned, it seems.
+        // first make sure block[j] is 16 bytes-aligned
+        for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
+            level = block[j];
+            if (level) {
+                if (level < 0) {
+                    level = level * qmul - qadd;
+                } else {
+                    level = level * qmul + qadd;
+                }
+                block[j] = level;
+            }
         }
-    }
 #endif
 
-    // vectorize all the 16 bytes-aligned blocks
-    // of 8 elements
-    for(; (j + 7) <= nCoeffs ; j+=8)
-    {
-        blockv = vec_ld(j << 1, block);
-        blockv_neg = vec_cmplt(blockv, vczero);
-        blockv_null = vec_cmpeq(blockv, vczero);
-        // choose between +qadd or -qadd as the third operand
-        temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
-        // multiply & add (block{i,i+7} * qmul [+-] qadd)
-        temp1 = vec_mladd(blockv, qmulv, temp1);
-        // put 0 where block[{i,i+7} used to have 0
-        blockv = vec_sel(temp1, blockv, blockv_null);
-        vec_st(blockv, j << 1, block);
-    }
-
-    // if nCoeffs isn't a multiple of 8, finish the job
-    // using good old scalar units.
-    // (we could do it using a truncated vector,
-    // but I'm not sure it's worth the hassle)
-    for(; j <= nCoeffs ; j++) {
-        level = block[j];
-        if (level) {
-            if (level < 0) {
-                level = level * qmul - qadd;
-            } else {
-                level = level * qmul + qadd;
+        // vectorize all the 16 bytes-aligned blocks
+        // of 8 elements
+        for(; (j + 7) <= nCoeffs ; j+=8) {
+            blockv = vec_ld(j << 1, block);
+            blockv_neg = vec_cmplt(blockv, vczero);
+            blockv_null = vec_cmpeq(blockv, vczero);
+            // choose between +qadd or -qadd as the third operand
+            temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
+            // multiply & add (block{i,i+7} * qmul [+-] qadd)
+            temp1 = vec_mladd(blockv, qmulv, temp1);
+            // put 0 where block[{i,i+7} used to have 0
+            blockv = vec_sel(temp1, blockv, blockv_null);
+            vec_st(blockv, j << 1, block);
+        }
+
+        // if nCoeffs isn't a multiple of 8, finish the job
+        // using good old scalar units.
+        // (we could do it using a truncated vector,
+        // but I'm not sure it's worth the hassle)
+        for(; j <= nCoeffs ; j++) {
+            level = block[j];
+            if (level) {
+                if (level < 0) {
+                    level = level * qmul - qadd;
+                } else {
+                    level = level * qmul + qadd;
+                }
+                block[j] = level;
+            }
         }
-        block[j] = level;
-    }
-    if (i == 1)
-    { // cheat. this avoid special-casing the first iteration
-        block[0] = backup_0;
-    }
+        if (i == 1) {
+            // cheat. this avoid special-casing the first iteration
+            block[0] = backup_0;
+        }
 }
 POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
 }
+
+
+void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
+void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
+
+void MPV_common_init_altivec(MpegEncContext *s)
+{
+    if ((mm_flags & FF_MM_ALTIVEC) == 0) return;
+
+    if (s->avctx->lowres==0) {
+        if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
+            (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) {
+            s->dsp.idct_put = idct_put_altivec;
+            s->dsp.idct_add = idct_add_altivec;
+            s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+        }
+    }
+
+    // Test to make sure that the dct required alignments are met.
+    if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
+        (((long)(s->q_inter_matrix) & 0x0f) != 0)) {
+        av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned "
+               "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
+        return;
+    }
+
+    if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) {
+        av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned "
+               "to use AltiVec DCT. Reverting to non-AltiVec version.\n");
+        return;
+    }
+
+
+    if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
+        (s->avctx->dct_algo == FF_DCT_ALTIVEC)) {
+#if 0 /* seems to cause trouble under some circumstances */
+        s->dct_quantize = dct_quantize_altivec;
+#endif
+        s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
+        s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
+    }
+}
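
A closing note on the branchless dequantize loop above: for eight coefficients at a time, the vector code picks +qadd or -qadd by sign with vec_sel(qaddv, nqaddv, blockv_neg), folds the multiply and the add into a single vec_mladd, and re-inserts the zero coefficients with vec_sel(temp1, blockv, blockv_null). The patch's own scalar tail loop spells out what each lane computes; here is the same step restated as self-contained portable C (dequant_h263_scalar is an illustrative name, not part of the patch):

    #include <stdint.h>

    /* per-lane model of the vec_sel / vec_mladd sequence:
     * level ? level * qmul + (level < 0 ? -qadd : qadd) : 0 */
    static void dequant_h263_scalar(int16_t *block, int nCoeffs,
                                    int qmul, int qadd)
    {
        for (int j = 0; j <= nCoeffs; j++) {
            int level = block[j];
            if (level)  /* zeros pass through, like vec_sel(..., blockv_null) */
                block[j] = (int16_t)(level * qmul + (level < 0 ? -qadd : qadd));
        }
    }

The vector version evaluates both the multiply-add result and the original value for all eight lanes and then selects per lane, which is why the inner loop needs no branches at all.
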