- int sum;
- register vector signed short
- temp0 asm ("v0"),
- temp1 asm ("v1"),
- temp2 asm ("v2"),
- temp3 asm ("v3"),
- temp4 asm ("v4"),
- temp5 asm ("v5"),
- temp6 asm ("v6"),
- temp7 asm ("v7");
- register vector signed short
- temp0S asm ("v8"),
- temp1S asm ("v9"),
- temp2S asm ("v10"),
- temp3S asm ("v11"),
- temp4S asm ("v12"),
- temp5S asm ("v13"),
- temp6S asm ("v14"),
- temp7S asm ("v15");
- register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0);
- {
- register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
- register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
- register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
- register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char)
- AVV(0x02, 0x03, 0x00, 0x01,
- 0x06, 0x07, 0x04, 0x05,
- 0x0A, 0x0B, 0x08, 0x09,
- 0x0E, 0x0F, 0x0C, 0x0D);
- register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char)
- AVV(0x04, 0x05, 0x06, 0x07,
- 0x00, 0x01, 0x02, 0x03,
- 0x0C, 0x0D, 0x0E, 0x0F,
- 0x08, 0x09, 0x0A, 0x0B);
- register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char)
- AVV(0x08, 0x09, 0x0A, 0x0B,
- 0x0C, 0x0D, 0x0E, 0x0F,
- 0x00, 0x01, 0x02, 0x03,
- 0x04, 0x05, 0x06, 0x07);
-
-#define ONEITERBUTTERFLY(i, res1, res2) \
- { \
- register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
- register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
- src1 = vec_ld(stride * i, src); \
- src2 = vec_ld((stride * i) + 16, src); \
- register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
- dst1 = vec_ld(stride * i, dst); \
- dst2 = vec_ld((stride * i) + 16, dst); \
- register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
- /* promote the unsigned chars to signed shorts */ \
- register vector signed short srcV asm ("v24") = \
- (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
- register vector signed short dstV asm ("v25") = \
- (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
- register vector signed short srcW asm ("v26") = \
- (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
- register vector signed short dstW asm ("v27") = \
- (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
- /* substractions inside the first butterfly */ \
- register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
- register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
- register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
- register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
- register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
- register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
- register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
- register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
- register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
- register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
- register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
- res1 = vec_mladd(but2, vprod3, op3); \
- register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
- res2 = vec_mladd(but2S, vprod3, op3S); \
+ int sum; /* only declared in this excerpt; its use lies outside the visible lines */
+ register vector signed short
+ temp0 __asm__ ("v0"), /* temp0..temp7: each explicitly bound to one of v0..v7 */
+ temp1 __asm__ ("v1"),
+ temp2 __asm__ ("v2"),
+ temp3 __asm__ ("v3"),
+ temp4 __asm__ ("v4"),
+ temp5 __asm__ ("v5"),
+ temp6 __asm__ ("v6"),
+ temp7 __asm__ ("v7");
+ register vector signed short
+ temp0S __asm__ ("v8"), /* temp0S..temp7S: second set of eight, bound to v8..v15 */
+ temp1S __asm__ ("v9"),
+ temp2S __asm__ ("v10"),
+ temp3S __asm__ ("v11"),
+ temp4S __asm__ ("v12"),
+ temp5S __asm__ ("v13"),
+ temp6S __asm__ ("v14"),
+ temp7S __asm__ ("v15");
+ register const vector unsigned char vzero __asm__ ("v31") =
+ (const vector unsigned char)vec_splat_u8(0); /* all-zero bytes; ONEITERBUTTERFLY merges it with the loaded src/dst bytes to widen them to 16-bit lanes */
+ {
+ register const vector signed short vprod1 __asm__ ("v16") =
+ (const vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1 }; /* stage-1 multiplier for vec_mladd: sign flips on every other element */
+ register const vector signed short vprod2 __asm__ ("v17") =
+ (const vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1 }; /* stage-2 multiplier: sign flips on every other pair of elements */
+ register const vector signed short vprod3 __asm__ ("v18") =
+ (const vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1 }; /* stage-3 multiplier: lower four elements +1, upper four -1 */
+ register const vector unsigned char perm1 __asm__ ("v19") =
+ (const vector unsigned char)
+ {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, /* byte indices for vec_perm: swap each adjacent pair of 2-byte elements */
+ 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
+ register const vector unsigned char perm2 __asm__ ("v20") =
+ (const vector unsigned char)
+ {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, /* swap each adjacent pair of 4-byte groups */
+ 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
+ register const vector unsigned char perm3 __asm__ ("v21") =
+ (const vector unsigned char)
+ {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, /* swap the two 8-byte halves of the vector */
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
+
+#define ONEITERBUTTERFLY(i, res1, res2) \
+ { \
+ register vector unsigned char src1 __asm__ ("v22"), \
+ src2 __asm__ ("v23"), \
+ dst1 __asm__ ("v24"), \
+ dst2 __asm__ ("v25"), \
+ srcO __asm__ ("v22"), \
+ dstO __asm__ ("v23"); \
+ \
+ register vector signed short srcV __asm__ ("v24"), \
+ dstV __asm__ ("v25"), \
+ srcW __asm__ ("v26"), \
+ dstW __asm__ ("v27"), \
+ but0 __asm__ ("v28"), \
+ but0S __asm__ ("v29"), \
+ op1 __asm__ ("v30"), \
+ but1 __asm__ ("v22"), \
+ op1S __asm__ ("v23"), \
+ but1S __asm__ ("v24"), \
+ op2 __asm__ ("v25"), \
+ but2 __asm__ ("v26"), \
+ op2S __asm__ ("v27"), \
+ but2S __asm__ ("v28"), \
+ op3 __asm__ ("v29"), \
+ op3S __asm__ ("v30"); \
+ \
+ src1 = vec_ld(stride * i, src); \
+ src2 = vec_ld((stride * i) + 16, src); \
+ srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
+ dst1 = vec_ld(stride * i, dst); \
+ dst2 = vec_ld((stride * i) + 16, dst); \
+ dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
+ /* promote the unsigned chars to signed shorts */ \
+ srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
+ (vector signed char)srcO); \
+ dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
+ (vector signed char)dstO); \
+ srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
+ (vector signed char)srcO); \
+ dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
+ (vector signed char)dstO); \
+ /* subtractions inside the first butterfly */ \
+ but0 = vec_sub(srcV, dstV); \
+ but0S = vec_sub(srcW, dstW); \
+ op1 = vec_perm(but0, but0, perm1); \
+ but1 = vec_mladd(but0, vprod1, op1); \
+ op1S = vec_perm(but0S, but0S, perm1); \
+ but1S = vec_mladd(but0S, vprod1, op1S); \
+ op2 = vec_perm(but1, but1, perm2); \
+ but2 = vec_mladd(but1, vprod2, op2); \
+ op2S = vec_perm(but1S, but1S, perm2); \
+ but2S = vec_mladd(but1S, vprod2, op2S); \
+ op3 = vec_perm(but2, but2, perm3); \
+ res1 = vec_mladd(but2, vprod3, op3); \
+ op3S = vec_perm(but2S, but2S, perm3); \
+ res2 = vec_mladd(but2S, vprod3, op3S); \