git.sesse.net Git - x264/commitdiff
ppc: Add little-endian PowerPC support
author Rong Yan <rongyan236@foxmail.com>
Mon, 20 Jul 2015 08:34:20 +0000 (03:34 -0500)
committer Henrik Gramner <henrik@gramner.com>
Sat, 25 Jul 2015 20:52:55 +0000 (22:52 +0200)
common/ppc/dct.c
common/ppc/mc.c
common/ppc/ppccommon.h
common/ppc/quant.c
config.guess
configure

index 6e17dcb0a3652bf1a7ed5b905ecb362bcc8d2138..61bcbb7586d8a6267c5ef10568cd9a87ad701bd4 100644 (file)
@@ -264,7 +264,7 @@ void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix
     vec_u8_t lv = vec_ld(0, dest);                              \
     vec_u8_t dstv = vec_perm(lv, zero_u8v, (vec_u8_t)perm_ldv); \
     vec_s16_t idct_sh6 = vec_sra(idctv, sixv);                  \
-    vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv);    \
+    vec_u16_t dst16 = vec_u8_to_u16_h(dstv);                    \
     vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16);   \
     vec_u8_t idstsum8 = vec_s16_to_u8(idstsum);                 \
     /* unaligned store */                                       \
@@ -384,7 +384,7 @@ void x264_add16x16_idct_altivec( uint8_t *p_dst, int16_t dct[16][16] )
     vec_u8_t lv = vec_ld( 7, dest );                           \
     vec_u8_t dstv   = vec_perm( hv, lv, (vec_u8_t)perm_ldv );  \
     vec_s16_t idct_sh6 = vec_sra(idctv, sixv);                 \
-    vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv);   \
+    vec_u16_t dst16 = vec_u8_to_u16_h(dstv);                   \
     vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16);  \
     vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum);        \
     /* unaligned store */                                      \
index 0dfedda1b9f8c7ff148f1747011aa478aa2d1336..73c88a4cf814d779fa2f70889a71ac8b5e8f792e 100644 (file)
@@ -291,6 +291,12 @@ static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
     }
  }
 
+#ifdef WORDS_BIGENDIAN
+#define VSLD(a,b,n) vec_sld(a,b,n)
+#else
+#define VSLD(a,b,n) vec_sld(b,a,16-n)
+#endif
+
 static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
                                    uint8_t *src, intptr_t i_src_stride,
                                    int mvx, int mvy, int i_height )
@@ -316,8 +322,13 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
     vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16, dstv16;
     vec_u16_t   shiftv, k32v;
 
+#ifdef WORDS_BIGENDIAN
     static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13);
     static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15);
+#else
+    static const vec_u8_t perm0v = CV(0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12);
+    static const vec_u8_t perm1v = CV(2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14);
+#endif
 
     coeff0v = vec_ld( 0, coeff );
     coeff3v = vec_splat( coeff0v, 3 );
@@ -329,7 +340,7 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
 
     VEC_LOAD( src, src2v_8, 9, vec_u8_t, src );
     src2v_16 = vec_u8_to_u16( src2v_8 );
-    src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );
+    src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
 
     for( int y = 0; y < i_height; y += 2 )
     {
@@ -337,7 +348,7 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
         src1v_16 = src3v_16;
         VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
         src2v_16 = vec_u8_to_u16( src2v_8 );
-        src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );
+        src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
 
         dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
         dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
@@ -359,7 +370,7 @@ static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
         src1v_16 = src3v_16;
         VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
         src2v_16 = vec_u8_to_u16( src2v_8 );
-        src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) );
+        src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
 
         dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
         dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
@@ -415,12 +426,17 @@ static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
     k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
     shiftv  = vec_splat_u16( 6 );
 
+#ifdef WORDS_BIGENDIAN
     static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0);
     static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0);
+#else
+    static const vec_u8_t perm0v = CV(0,4,8,12,16,20,24,28,1,1,1,1,1,1,1,1);
+    static const vec_u8_t perm1v = CV(2,6,10,14,18,22,26,30,1,1,1,1,1,1,1,1);
+#endif
 
     VEC_LOAD( src, src2v_8, 16, vec_u8_t, src );
     VEC_LOAD( src+16, src3v_8, 2, vec_u8_t, src );
-    src3v_8 = vec_sld( src2v_8, src3v_8, 2 );
+    src3v_8 = VSLD( src2v_8, src3v_8, 2 );
 
     for( int y = 0; y < i_height; y += 2 )
     {
@@ -429,7 +445,7 @@ static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
         VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
         VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
 
-        src3v_8 = vec_sld( src2v_8, src3v_8, 2 );
+        src3v_8 = VSLD( src2v_8, src3v_8, 2 );
 
         src0v_16h = vec_u8_to_u16_h( src0v_8 );
         src0v_16l = vec_u8_to_u16_l( src0v_8 );
@@ -467,7 +483,7 @@ static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_
         VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
         VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );
 
-        src3v_8 = vec_sld( src2v_8, src3v_8, 2 );
+        src3v_8 = VSLD( src2v_8, src3v_8, 2 );
 
         src0v_16h = vec_u8_to_u16_h( src0v_8 );
         src0v_16l = vec_u8_to_u16_l( src0v_8 );
@@ -550,11 +566,11 @@ static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stri
     VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \
     VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \
                                                              \
-    src2v = vec_sld( src1v, src6v,  1 );                     \
-    src3v = vec_sld( src1v, src6v,  2 );                     \
-    src4v = vec_sld( src1v, src6v,  3 );                     \
-    src5v = vec_sld( src1v, src6v,  4 );                     \
-    src6v = vec_sld( src1v, src6v,  5 );                     \
+    src2v = VSLD( src1v, src6v,  1 );                        \
+    src3v = VSLD( src1v, src6v,  2 );                        \
+    src4v = VSLD( src1v, src6v,  3 );                        \
+    src5v = VSLD( src1v, src6v,  4 );                        \
+    src6v = VSLD( src1v, src6v,  5 );                        \
                                                              \
     temp1v = vec_u8_to_s16_h( src1v );                       \
     temp2v = vec_u8_to_s16_h( src2v );                       \
@@ -629,12 +645,12 @@ static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stri
 
 #define HPEL_FILTER_CENTRAL()                           \
 {                                                       \
-    temp1v = vec_sld( tempav, tempbv, 12 );             \
-    temp2v = vec_sld( tempav, tempbv, 14 );             \
+    temp1v = VSLD( tempav, tempbv, 12 );                \
+    temp2v = VSLD( tempav, tempbv, 14 );                \
     temp3v = tempbv;                                    \
-    temp4v = vec_sld( tempbv, tempcv,  2 );             \
-    temp5v = vec_sld( tempbv, tempcv,  4 );             \
-    temp6v = vec_sld( tempbv, tempcv,  6 );             \
+    temp4v = VSLD( tempbv, tempcv,  2 );                \
+    temp5v = VSLD( tempbv, tempcv,  4 );                \
+    temp6v = VSLD( tempbv, tempcv,  6 );                \
                                                         \
     HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
                    temp4v, temp5v, temp6v );            \
@@ -642,12 +658,12 @@ static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stri
     dest1v = vec_add( temp1v, thirtytwov );             \
     dest1v = vec_sra( dest1v, sixv );                   \
                                                         \
-    temp1v = vec_sld( tempbv, tempcv, 12 );             \
-    temp2v = vec_sld( tempbv, tempcv, 14 );             \
+    temp1v = VSLD( tempbv, tempcv, 12 );                \
+    temp2v = VSLD( tempbv, tempcv, 14 );                \
     temp3v = tempcv;                                    \
-    temp4v = vec_sld( tempcv, tempdv,  2 );             \
-    temp5v = vec_sld( tempcv, tempdv,  4 );             \
-    temp6v = vec_sld( tempcv, tempdv,  6 );             \
+    temp4v = VSLD( tempcv, tempdv,  2 );                \
+    temp5v = VSLD( tempcv, tempdv,  4 );                \
+    temp6v = VSLD( tempcv, tempdv,  6 );                \
                                                         \
     HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
                    temp4v, temp5v, temp6v );            \
@@ -764,6 +780,9 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_
     vec_u8_t lv, hv, src1p1v;
     vec_u8_t avg0v, avg1v, avghv, avghp1v, avgleftv, avgrightv;
     static const vec_u8_t inverse_bridge_shuffle = CV(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E );
+#ifndef WORDS_BIGENDIAN
+    static const vec_u8_t inverse_bridge_shuffle_1 = CV(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F );
+#endif
 
     for( int y = 0; y < height; y++ )
     {
@@ -788,11 +807,15 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_
             src1p1v = vec_ld(16*(x*2+2), src1);
             avghp1v = vec_avg(lv, src1p1v);
 
-            avgleftv = vec_avg(vec_sld(avg0v, avghv, 1), avg0v);
-            avgrightv = vec_avg(vec_sld(avghv, avghp1v, 1), avghv);
+            avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v);
+            avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv);
 
             vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dst0);
+#ifdef WORDS_BIGENDIAN
             vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dsth);
+#else
+            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dsth);
+#endif
 
             avg0v = avghp1v;
 
@@ -802,11 +825,15 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_
             hv = vec_ld(16*(x*2+2), src2);
             avghp1v = vec_avg(src1p1v, hv);
 
-            avgleftv = vec_avg(vec_sld(avg1v, avghv, 1), avg1v);
-            avgrightv = vec_avg(vec_sld(avghv, avghp1v, 1), avghv);
+            avgleftv = vec_avg(VSLD(avg1v, avghv, 1), avg1v);
+            avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv);
 
             vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dstv);
+#ifdef WORDS_BIGENDIAN
             vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dstc);
+#else
+            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dstc);
+#endif
 
             avg1v = avghp1v;
 
@@ -820,11 +847,15 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_
             lv = vec_ld(16*(x*2+1), src2);
             avghp1v = vec_avg(src1v, lv);
 
-            avgleftv = vec_avg(vec_sld(avg0v, avghv, 1), avg0v);
-            avgrightv = vec_avg(vec_sld(avg1v, avghp1v, 1), avg1v);
+            avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v);
+            avgrightv = vec_avg(VSLD(avg1v, avghp1v, 1), avg1v);
 
             lv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle);
+#ifdef WORDS_BIGENDIAN
             hv = (vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv);
+#else
+            hv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1);
+#endif
 
             vec_ste((vec_u32_t)lv,16*x,(uint32_t*)dst0);
             vec_ste((vec_u32_t)lv,16*x+4,(uint32_t*)dst0);
index 67d4fd72ea354e244831d94c9285680ec5230358..420842886b422b315ff6cce5a7a9f6a368a805c7 100644 (file)
@@ -81,10 +81,17 @@ typedef union {
 /***********************************************************************
  * 8 <-> 16 bits conversions
  **********************************************************************/
+#ifdef WORDS_BIGENDIAN
 #define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
 #define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
 #define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
 #define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
+#else
+#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
+#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
+#endif
 
 #define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
 #define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
@@ -96,10 +103,17 @@ typedef union {
 /***********************************************************************
  * 16 <-> 32 bits conversions
  **********************************************************************/
+#ifdef WORDS_BIGENDIAN
 #define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
 #define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
 #define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
 #define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
+#else
+#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( (vec_u16_t) v, zero_u16v )
+#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( (vec_u16_t) v, zero_u16v )
+#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( (vec_u16_t) v, zero_u16v )
+#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( (vec_u16_t) v, zero_u16v )
+#endif
 
 #define vec_u16_to_u32(v) vec_u16_to_u32_h(v)
 #define vec_u16_to_s32(v) vec_u16_to_s32_h(v)
index 9382b5b4e87717f979104bb25872b7f3c18f5ef6..51654e0c91ae3c72da119ca95320cc5236f45cb2 100644 (file)
@@ -251,6 +251,14 @@ int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64]
     vec_st(dctv, 8*y, dct);                                          \
 }
 
+#ifdef WORDS_BIGENDIAN
+#define VEC_MULE vec_mule
+#define VEC_MULO vec_mulo
+#else
+#define VEC_MULE vec_mulo
+#define VEC_MULO vec_mule
+#endif
+
 #define DEQUANT_SHR()                                          \
 {                                                              \
     dctv = vec_ld(8*y, dct);                                   \
@@ -259,14 +267,14 @@ int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64]
     mf1v = vec_ld(16*y, dequant_mf[i_mf]);                     \
     mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                  \
                                                                \
-    multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v);             \
-    multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v);              \
+    multEvenvA = VEC_MULE(dct1v, (vec_s16_t)mf1v);             \
+    multOddvA = VEC_MULO(dct1v, (vec_s16_t)mf1v);              \
     temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
     temp1v = vec_add(temp1v, fv);                              \
     temp1v = vec_sra(temp1v, i_qbitsv);                        \
                                                                \
-    multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v);             \
-    multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v);              \
+    multEvenvA = VEC_MULE(dct2v, (vec_s16_t)mf2v);             \
+    multOddvA = VEC_MULO(dct2v, (vec_s16_t)mf2v);              \
     temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
     temp2v = vec_add(temp2v, fv);                              \
     temp2v = vec_sra(temp2v, i_qbitsv);                        \
index 872b96a161eecdf418668d0c9dc2ca84a34b6642..375fa2a818f84ff8727cec0f516b9c8160ff5749 100755 (executable)
@@ -979,6 +979,9 @@ EOF
     ppc64:Linux:*:*)
        echo powerpc64-unknown-linux-gnu
        exit ;;
+    ppc64le:Linux:*:*)
+       echo powerpc64le-unknown-linux-gnu
+       exit ;;
     ppc:Linux:*:*)
        echo powerpc-unknown-linux-gnu
        exit ;;
index d82c44906d3ed48d52d5c0363340b6e2b10b6f81..85ab3f28a023722781c20a1eb0d9d95d6898e666 100755 (executable)
--- a/configure
+++ b/configure
@@ -711,7 +711,7 @@ case $host_cpu in
             ASFLAGS="$ASFLAGS -f elf64"
         fi
         ;;
-    powerpc|powerpc64)
+    powerpc*)
         ARCH="PPC"
         if [ $asm = auto ] ; then
             define HAVE_ALTIVEC
@@ -881,11 +881,13 @@ define STACK_ALIGNMENT $stack_alignment
 ASFLAGS="$ASFLAGS -DSTACK_ALIGNMENT=$stack_alignment"
 
 # skip endianness check for Intel Compiler and MSVS, as all supported platforms are little. each have flags that will cause the check to fail as well
+CPU_ENDIAN="little-endian"
 if [ $compiler = GNU ]; then
     echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c
     $CC $CFLAGS conftest.c -c -o conftest.o 2>/dev/null || die "endian test failed"
     if (${cross_prefix}strings -a conftest.o | grep -q BIGE) && (${cross_prefix}strings -a conftest.o | grep -q FPendian) ; then
         define WORDS_BIGENDIAN
+        CPU_ENDIAN="big-endian"
     elif !(${cross_prefix}strings -a conftest.o | grep -q EGIB && ${cross_prefix}strings -a conftest.o | grep -q naidnePF) ; then
         die "endian test failed"
     fi
@@ -1403,6 +1405,7 @@ gpl_filters=""
 
 cat > conftest.log <<EOF
 platform:      $ARCH
+byte order:    $CPU_ENDIAN
 system:        $SYS
 cli:           $cli
 libx264:       $cli_libx264