Neither GCC nor ARMCC supports 16-byte stack alignment, even though NEON loads require it.
These macros work only for arrays, but fortunately that covers almost all instances of stack alignment in x264.
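As a rough illustration (not part of the patch; the variable names and standalone wrapper are invented for the example), the ARM fallback of ALIGNED_ARRAY_16 works by over-allocating an 8-byte-aligned buffer by 8 bytes and bumping the pointer past the misaligned half when needed:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* the compiler guarantees only 8-byte alignment for this buffer */
        uint8_t buf_8[16*16 + 8] __attribute__((aligned(8)));
        /* bit 3 is set exactly when buf_8 is 8- but not 16-byte aligned,
         * so adding it yields a 16-byte-aligned pointer into the buffer */
        uint8_t *buf = buf_8 + ((intptr_t)buf_8 & 8);
        printf("16-byte aligned: %d\n", (int)(((intptr_t)buf & 15) == 0));
        return 0;
    }

On non-ARM targets the same invocation expands to a plain 16-byte-aligned array, so indexing such as dct8x8[i][j][k] is unchanged on both paths.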
uint8_t *p_end;
/* aligned for memcpy_aligned starting here */
- DECLARE_ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
+ ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
/* context */
uint8_t state[460];
const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
- DECLARE_ALIGNED_16( uint32_t nr_residual_sum[2][64] );
- DECLARE_ALIGNED_16( uint16_t nr_offset[2][64] );
+ ALIGNED_16( uint32_t nr_residual_sum[2][64] );
+ ALIGNED_16( uint16_t nr_offset[2][64] );
uint32_t nr_count[2];
/* Slice header */
/* Current MB DCT coeffs */
struct
{
- DECLARE_ALIGNED_16( int16_t luma16x16_dc[16] );
- DECLARE_ALIGNED_16( int16_t chroma_dc[2][4] );
+ ALIGNED_16( int16_t luma16x16_dc[16] );
+ ALIGNED_16( int16_t chroma_dc[2][4] );
// FIXME share memory?
- DECLARE_ALIGNED_16( int16_t luma8x8[4][64] );
- DECLARE_ALIGNED_16( int16_t luma4x4[16+8][16] );
+ ALIGNED_16( int16_t luma8x8[4][64] );
+ ALIGNED_16( int16_t luma4x4[16+8][16] );
} dct;
/* MB table and cache for current frame/mb */
/* current value */
int i_type;
int i_partition;
- DECLARE_ALIGNED_4( uint8_t i_sub_partition[4] );
+ ALIGNED_4( uint8_t i_sub_partition[4] );
int b_transform_8x8;
int i_cbp_luma;
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
- DECLARE_ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] );
- DECLARE_ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] );
+ ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] );
+ ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
- DECLARE_ALIGNED_16( uint8_t i4x4_fdec_buf[16*16] );
- DECLARE_ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
- DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
- DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
+ ALIGNED_16( uint8_t i4x4_fdec_buf[16*16] );
+ ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
+ ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
+ ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
uint32_t i4x4_nnz_buf[4];
uint32_t i8x8_nnz_buf[4];
int i4x4_cbp;
int i8x8_cbp;
/* Psy trellis DCT data */
- DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] );
- DECLARE_ALIGNED_16( int16_t fenc_dct4[16][16] );
+ ALIGNED_16( int16_t fenc_dct8[4][64] );
+ ALIGNED_16( int16_t fenc_dct4[16][16] );
/* Psy RD SATD scores */
int fenc_satd[4][4];
uint8_t non_zero_count[X264_SCAN8_SIZE];
/* -1 if unused, -2 if unavailable */
- DECLARE_ALIGNED_4( int8_t ref[2][X264_SCAN8_SIZE] );
+ ALIGNED_4( int8_t ref[2][X264_SCAN8_SIZE] );
/* 0 if not available */
- DECLARE_ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
- DECLARE_ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
+ ALIGNED_16( int16_t mv[2][X264_SCAN8_SIZE][2] );
+ ALIGNED_8( int16_t mvd[2][X264_SCAN8_SIZE][2] );
/* 1 if SKIP or DIRECT. set only for B-frames + CABAC */
- DECLARE_ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
+ ALIGNED_4( int8_t skip[X264_SCAN8_SIZE] );
- DECLARE_ALIGNED_16( int16_t direct_mv[2][X264_SCAN8_SIZE][2] );
- DECLARE_ALIGNED_4( int8_t direct_ref[2][X264_SCAN8_SIZE] );
- DECLARE_ALIGNED_4( int16_t pskip_mv[2] );
+ ALIGNED_16( int16_t direct_mv[2][X264_SCAN8_SIZE][2] );
+ ALIGNED_4( int8_t direct_ref[2][X264_SCAN8_SIZE] );
+ ALIGNED_4( int16_t pskip_mv[2] );
/* number of neighbors (top and left) that used 8x8 dct */
int i_neighbour_transform_size;
{\
int i_edge = (i_dir ? (mb_y <= b_interlaced) : (mb_x == 0));\
int i_qpn, i, mbn_xy, mbn_8x8, mbn_4x4;\
- DECLARE_ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
+ ALIGNED_4( uint8_t bS[4] ); /* filtering strength */\
if( i_edge )\
i_edge+= b_8x8_transform;\
else\
static int x264_mb_predict_mv_direct16x16_spatial( x264_t *h )
{
int ref[2];
- DECLARE_ALIGNED_8( int16_t mv[2][2] );
+ ALIGNED_8( int16_t mv[2][2] );
int i_list;
int i8;
const int8_t *l1ref0 = &h->fref1[0]->ref[0][ h->mb.i_b8_xy ];
int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
int i_mode = x264_size2pixel[height][width];
int i_stride0 = 16, i_stride1 = 16;
- DECLARE_ALIGNED_16( uint8_t tmp0[16*16] );
- DECLARE_ALIGNED_16( uint8_t tmp1[16*16] );
+ ALIGNED_ARRAY_16( uint8_t, tmp0,[16*16] );
+ ALIGNED_ARRAY_16( uint8_t, tmp1,[16*16] );
uint8_t *src0, *src1;
src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
#else
#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#endif
-#define DECLARE_ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
-#define DECLARE_ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
-#define DECLARE_ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
+#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
+#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
+#define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
+
+// current arm compilers only maintain 8-byte stack alignment
+// and cannot align stack variables to more than 8 bytes
+#ifdef ARCH_ARM
+#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
+ ALIGNED_8( uint8_t name##_8 [sizeof(type sub1 __VA_ARGS__) + 8] );\
+ type (*name) __VA_ARGS__ = (void*)(name##_8 + ((intptr_t)name##_8 & 8))
+#else
+#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
+ ALIGNED_16( type name sub1 __VA_ARGS__ )
+#endif
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
#define UNUSED __attribute__((unused))
static inline void write16x4(uint8_t *dst, int dst_stride,
register vec_u8_t r0, register vec_u8_t r1,
register vec_u8_t r2, register vec_u8_t r3) {
- DECLARE_ALIGNED_16(unsigned char result[64]);
+ ALIGNED_16(unsigned char result[64]);
uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
int int_dst_stride = dst_stride/4;
}
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
- DECLARE_ALIGNED_16(unsigned char temp[16]); \
+ ALIGNED_16(unsigned char temp[16]); \
register vec_u8_t alphavec; \
register vec_u8_t betavec; \
register vec_u8_t mask; \
int d8x = mvx & 0x07;
int d8y = mvy & 0x07;
- DECLARE_ALIGNED_16( uint16_t coeff[4] );
+ ALIGNED_16( uint16_t coeff[4] );
coeff[0] = (8-d8x)*(8-d8y);
coeff[1] = d8x *(8-d8y);
coeff[2] = (8-d8x)*d8y;
int d8x = mvx & 0x07;
int d8y = mvy & 0x07;
- DECLARE_ALIGNED_16( uint16_t coeff[4] );
+ ALIGNED_16( uint16_t coeff[4] );
coeff[0] = (8-d8x)*(8-d8y);
coeff[1] = d8x *(8-d8y);
coeff[2] = (8-d8x)*d8y;
uint8_t *pix2, int i_pix2 ) \
{ \
int y; \
- DECLARE_ALIGNED_16( int sum ); \
+ ALIGNED_16( int sum ); \
\
LOAD_ZERO; \
PREP_LOAD; \
static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
PREP_DIFF;
PREP_LOAD_SRC( pix1 );
static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
PREP_DIFF;
vec_s16_t diff0v, diff1v, diff2v, diff3v;
static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
PREP_DIFF;
vec_s16_t diff0v, diff1v, diff2v, diff3v,
static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
PREP_DIFF;
vec_s16_t diff0v, diff1v, diff2v, diff3v,
static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
PREP_DIFF;
vec_s16_t diff0v, diff1v, diff2v, diff3v,
static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
LOAD_ZERO;
PREP_LOAD;
static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
uint8_t *pix2, int i_pix2 )
{
- DECLARE_ALIGNED_16( int i_satd );
+ ALIGNED_16( int i_satd );
LOAD_ZERO;
PREP_LOAD;
uint8_t *pix2, uint8_t *pix3,
int i_stride, int scores[4] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
- DECLARE_ALIGNED_16( int sum3 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum3 );
int y;
LOAD_ZERO;
uint8_t *pix1, uint8_t *pix2,
int i_stride, int scores[3] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
int y;
LOAD_ZERO;
static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
- DECLARE_ALIGNED_16( int sum3 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum3 );
int y;
LOAD_ZERO;
uint8_t *pix1, uint8_t *pix2,
int i_stride, int scores[3] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
int y;
LOAD_ZERO;
uint8_t *pix2, uint8_t *pix3,
int i_stride, int scores[4] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
- DECLARE_ALIGNED_16( int sum3 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum3 );
int y;
LOAD_ZERO;
uint8_t *pix1, uint8_t *pix2,
int i_stride, int scores[3] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
int y;
LOAD_ZERO;
uint8_t *pix2, uint8_t *pix3,
int i_stride, int scores[4] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
- DECLARE_ALIGNED_16( int sum3 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum3 );
int y;
LOAD_ZERO;
uint8_t *pix1, uint8_t *pix2,
int i_stride, int scores[3] )
{
- DECLARE_ALIGNED_16( int sum0 );
- DECLARE_ALIGNED_16( int sum1 );
- DECLARE_ALIGNED_16( int sum2 );
+ ALIGNED_16( int sum0 );
+ ALIGNED_16( int sum1 );
+ ALIGNED_16( int sum2 );
int y;
LOAD_ZERO;
static int pixel_ssd_16x16_altivec ( uint8_t *pix1, int i_stride_pix1,
uint8_t *pix2, int i_stride_pix2)
{
- DECLARE_ALIGNED_16( int sum );
+ ALIGNED_16( int sum );
int y;
LOAD_ZERO;
static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
uint8_t *pix2, int i_stride_pix2)
{
- DECLARE_ALIGNED_16( int sum );
+ ALIGNED_16( int sum );
int y;
LOAD_ZERO;
****************************************************************************/
static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
{
- DECLARE_ALIGNED_16(uint32_t sum_tab[4]);
- DECLARE_ALIGNED_16(uint32_t sqr_tab[4]);
+ ALIGNED_16(uint32_t sum_tab[4]);
+ ALIGNED_16(uint32_t sqr_tab[4]);
LOAD_ZERO;
vec_u32_t sqr_v = zero_u32v;
static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
{
- DECLARE_ALIGNED_16(uint32_t sum_tab[4]);
- DECLARE_ALIGNED_16(uint32_t sqr_tab[4]);
+ ALIGNED_16(uint32_t sum_tab[4]);
+ ALIGNED_16(uint32_t sqr_tab[4]);
LOAD_ZERO;
vec_u32_t sqr_v = zero_u32v;
static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, int stride, const vec_u8_t perm )
{
- DECLARE_ALIGNED_16( int32_t sum4_tab[4] );
- DECLARE_ALIGNED_16( int32_t sum8_tab[4] );
+ ALIGNED_16( int32_t sum4_tab[4] );
+ ALIGNED_16( int32_t sum8_tab[4] );
LOAD_ZERO;
VEC_LOAD_HIGH( pix, 0 );
int sum8 = sum8_tab[3];
- DECLARE_ALIGNED_16( int16_t tmp0_4_tab[8] );
+ ALIGNED_16( int16_t tmp0_4_tab[8] );
vec_ste(vec_add(pix16_d0, pix16_d4), 0, tmp0_4_tab);
sum4 -= tmp0_4_tab[0];
const uint8_t *pix2, int stride2,
int sums[2][4] )
{
- DECLARE_ALIGNED_16( int temp[4] );
+ ALIGNED_16( int temp[4] );
int y;
vec_u8_t pix1v, pix2v;
extern void predict_16x16_v_sse2( uint8_t *src );
extern void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
-DECLARE_ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
-DECLARE_ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
-DECLARE_ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
+ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
+ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
+ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
#define PREDICT_P_SUM(j,i)\
H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
PREDICT_8x8_LOAD_TOP\
PREDICT_8x8_LOAD_LEFT\
int t;\
- DECLARE_ALIGNED_16( int16_t sa8d_1d[2][8] );\
+ ALIGNED_16( int16_t sa8d_1d[2][8] );\
SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);\
SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);\
SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);\
/* 8x8 */
int i_cost8x8;
/* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
- DECLARE_ALIGNED_4( int16_t mvc[32][5][2] );
+ ALIGNED_4( int16_t mvc[32][5][2] );
x264_me_t me8x8[4];
/* Sub 4x4 */
/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
{
- DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
- DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
- DECLARE_ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
+ ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[8][8] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
+ ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
int i;
if( do_both_dct || h->mb.b_transform_8x8 )
/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
static inline void x264_mb_cache_fenc_satd( x264_t *h )
{
- DECLARE_ALIGNED_16( static uint8_t zero[16] ) = {0};
+ ALIGNED_16( static uint8_t zero[16] ) = {0};
uint8_t *fenc;
int x, y, satd_sum = 0, sa8d_sum = 0;
if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
/* 8x8 prediction selection */
if( flags & X264_ANALYSE_I8x8 )
{
- DECLARE_ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_ARRAY_16( uint8_t, edge,[33] );
x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
int i_cost = 0;
}
else if( h->mb.i_type == I_8x8 )
{
- DECLARE_ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_ARRAY_16( uint8_t, edge,[33] );
for( idx = 0; idx < 4; idx++ )
{
uint64_t pels_h = 0;
{
x264_me_t m;
int i_ref, i_mvc;
- DECLARE_ALIGNED_4( int16_t mvc[8][2] );
+ ALIGNED_4( int16_t mvc[8][2] );
int i_halfpel_thresh = INT_MAX;
int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
{
x264_me_t m;
uint8_t **p_fenc = h->mb.pic.p_fenc;
- DECLARE_ALIGNED_4( int16_t mvc[3][2] );
+ ALIGNED_4( int16_t mvc[3][2] );
int i, j;
/* XXX Needed for x264_mb_predict_mv */
{
x264_me_t m;
uint8_t **p_fenc = h->mb.pic.p_fenc;
- DECLARE_ALIGNED_4( int16_t mvc[3][2] );
+ ALIGNED_4( int16_t mvc[3][2] );
int i, j;
/* XXX Needed for x264_mb_predict_mv */
static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
{
- DECLARE_ALIGNED_8( uint8_t pix1[16*8] );
+ ALIGNED_8( uint8_t pix1[16*8] );
uint8_t *pix2 = pix1+8;
const int i_stride = h->mb.pic.i_stride[1];
const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
- DECLARE_ALIGNED_16( uint8_t pix0[16*16] );
- DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
+ ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
+ ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
uint8_t *src0, *src1;
int stride0 = 16, stride1 = 16;
x264_me_t m;
int i_ref, i_mvc;
- DECLARE_ALIGNED_4( int16_t mvc[9][2] );
+ ALIGNED_4( int16_t mvc[9][2] );
int i_halfpel_thresh = INT_MAX;
int *p_halfpel_thresh = h->mb.pic.i_fref[0]>1 ? &i_halfpel_thresh : NULL;
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
- DECLARE_ALIGNED_8( uint8_t pix[2][8*8] );
+ ALIGNED_8( uint8_t pix[2][8*8] );
int i, l;
/* XXX Needed for x264_mb_predict_mv */
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
- DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
- DECLARE_ALIGNED_4( int16_t mvc[2][2] );
+ ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
+ ALIGNED_4( int16_t mvc[2][2] );
int i, l;
h->mb.i_partition = D_16x8;
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
- DECLARE_ALIGNED_8( uint8_t pix[2][8*16] );
- DECLARE_ALIGNED_4( int16_t mvc[2][2] );
+ ALIGNED_8( uint8_t pix[2][8*16] );
+ ALIGNED_4( int16_t mvc[2][2] );
int i, l;
h->mb.i_partition = D_8x16;
static NOINLINE uint32_t x264_cabac_mb_mvd( x264_t *h, x264_cabac_t *cb, int i_list, int idx, int width )
{
- DECLARE_ALIGNED_4( int16_t mvp[2] );
+ ALIGNED_4( int16_t mvp[2] );
uint32_t amvd;
int mdx, mdy;
static void cavlc_mb_mvd( x264_t *h, bs_t *s, int i_list, int idx, int width )
{
- DECLARE_ALIGNED_4( int16_t mvp[2] );
+ ALIGNED_4( int16_t mvp[2] );
x264_mb_predict_mv( h, i_list, idx, width, mvp );
bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][0] - mvp[0] );
bs_write_se( s, h->mb.cache.mv[i_list][x264_scan8[idx]][1] - mvp[1] );
int nz;
uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
- DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4] );
if( h->mb.b_lossless )
{
int nz;
uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
- DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
+ ALIGNED_ARRAY_16( int16_t, dct8x8,[8],[8] );
if( h->mb.b_lossless )
{
uint8_t *p_src = h->mb.pic.p_fenc[0];
uint8_t *p_dst = h->mb.pic.p_fdec[0];
- DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
- DECLARE_ALIGNED_16( int16_t dct_dc4x4[4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[4],[4] );
int i, nz;
int b_decimate = h->sh.i_type == SLICE_TYPE_B || (h->param.analyse.b_dct_decimate && h->sh.i_type == SLICE_TYPE_P);
{
int i, ch, nz, nz_dc;
int b_decimate = b_inter && (h->sh.i_type == SLICE_TYPE_B || h->param.analyse.b_dct_decimate);
- DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
+ ALIGNED_ARRAY_16( int16_t, dct2x2,[2],[2] );
h->mb.i_cbp_chroma = 0;
/* Early termination: check variance of chroma residual before encoding.
int i_decimate_score = 0;
int nz_ac = 0;
- DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4][4] );
if( h->mb.b_lossless )
{
}
else if( h->mb.i_type == I_8x8 )
{
- DECLARE_ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_ARRAY_16( uint8_t, edge,[33] );
h->mb.b_transform_8x8 = 1;
/* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
if( h->mb.i_skip_intra )
}
else if( h->mb.b_transform_8x8 )
{
- DECLARE_ALIGNED_16( int16_t dct8x8[4][8][8] );
+ ALIGNED_ARRAY_16( int16_t, dct8x8,[4],[8][8] );
b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
h->nr_count[1] += h->mb.b_noise_reduction * 4;
}
else
{
- DECLARE_ALIGNED_16( int16_t dct4x4[16][4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[4][4] );
h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
h->nr_count[0] += h->mb.b_noise_reduction * 16;
*****************************************************************************/
int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
{
- DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
- DECLARE_ALIGNED_16( int16_t dct2x2[2][2] );
- DECLARE_ALIGNED_16( int16_t dctscan[16] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct2x2,[2],[2] );
+ ALIGNED_ARRAY_16( int16_t, dctscan,[16] );
int i_qp = h->mb.i_qp;
int mvp[2];
{
if( h->mb.b_transform_8x8 )
{
- DECLARE_ALIGNED_16( int16_t dct8x8[8][8] );
+ ALIGNED_ARRAY_16( int16_t, dct8x8,[8],[8] );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, 0, i8 );
if( nnz8x8 )
{
int i4;
int i_decimate_8x8 = 0;
- DECLARE_ALIGNED_16( int16_t dct4x4[4][4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4][4] );
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
for( i4 = 0; i4 < 4; i4++ )
{
for( ch = 0; ch < 2; ch++ )
{
- DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4] );
p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
}
else
{
- DECLARE_ALIGNED_16( int16_t dct4x4[4][4] );
+ ALIGNED_ARRAY_16( int16_t, dct4x4,[4],[4] );
h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
nz = x264_quant_4x4( h, dct4x4, i_qp, DCT_LUMA_4x4, 0, i4 );
h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
int omx, omy, pmx, pmy;
uint8_t *p_fenc = m->p_fenc[0];
uint8_t *p_fref = m->p_fref[0];
- DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+ ALIGNED_ARRAY_16( uint8_t, pix,[16*16] );
int i, j;
int dir;
uint16_t *sums_base = m->integral;
/* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned.
* this is not a problem because it is not used for any SSE instructions. */
- DECLARE_ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
- DECLARE_ALIGNED_16( int enc_dc[4] );
+ ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
+ ALIGNED_ARRAY_16( int, enc_dc,[4] );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
int16_t *xs = h->scratch_buffer;
const int i_pixel = m->i_pixel;
const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
- DECLARE_ALIGNED_16( uint8_t pix[2][32*18] ); // really 17x17, but round up for alignment
+ ALIGNED_ARRAY_16( uint8_t, pix,[2],[32*18] ); // really 17x17, but round up for alignment
int omx, omy;
int i;
const int16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
const int16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
const int16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
- DECLARE_ALIGNED_16( uint8_t pixy_buf[2][9][16*16] );
- DECLARE_ALIGNED_8( uint8_t pixu_buf[2][9][8*8] );
- DECLARE_ALIGNED_8( uint8_t pixv_buf[2][9][8*8] );
+ ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] );
+ ALIGNED_8( uint8_t pixu_buf[2][9][8*8] );
+ ALIGNED_8( uint8_t pixv_buf[2][9][8*8] );
uint8_t *src0[9];
uint8_t *src1[9];
uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
int mc_list0 = 1, mc_list1 = 1;
uint64_t bcostrd = COST_MAX64;
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
- DECLARE_ALIGNED_16( uint8_t visited[8][8][8] );
+ ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
/* all permutations of an offset in up to 2 of the dimensions */
static const int8_t dia4d[32][4] = {
{0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0},
bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 )
return;
- h->mc.memzero_aligned( visited, sizeof(visited) );
+ h->mc.memzero_aligned( visited, sizeof(uint8_t[8][8][8]) );
BIME_CACHE( 0, 0, 0 );
BIME_CACHE( 0, 0, 1 );
const int bh = x264_pixel_size[m->i_pixel].h>>2;
const int i_pixel = m->i_pixel;
- DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+ ALIGNED_ARRAY_16( uint8_t, pix,[16*16] );
uint64_t bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX64;
int bmx = m->mv[0];
int bmy = m->mv[1];
uint16_t *integral;
int i_stride[2];
- DECLARE_ALIGNED_4( int16_t mvp[2] );
+ ALIGNED_4( int16_t mvp[2] );
/* output */
int cost_mv; /* lambda * nbits for the chosen mv */
int cost; /* satd + lambda * nbits */
- DECLARE_ALIGNED_4( int16_t mv[2] );
-} DECLARE_ALIGNED_16( x264_me_t );
+ ALIGNED_4( int16_t mv[2] );
+} ALIGNED_16( x264_me_t );
typedef struct {
int sad;
static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
{
- DECLARE_ALIGNED_16(static uint8_t zero[16]);
+ ALIGNED_16(static uint8_t zero[16]);
int satd = 0;
uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
int16_t (*fenc_mvs[2])[2] = { &frames[b]->lowres_mvs[0][b-p0-1][i_mb_xy], &frames[b]->lowres_mvs[1][p1-b-1][i_mb_xy] };
int (*fenc_costs[2]) = { &frames[b]->lowres_mv_costs[0][b-p0-1][i_mb_xy], &frames[b]->lowres_mv_costs[1][p1-b-1][i_mb_xy] };
- DECLARE_ALIGNED_8( uint8_t pix1[9*FDEC_STRIDE] );
+ ALIGNED_8( uint8_t pix1[9*FDEC_STRIDE] );
uint8_t *pix2 = pix1+8;
x264_me_t m[2];
int i_bcost = COST_MAX;
{
int i_mvc = 0;
int16_t (*fenc_mv)[2] = fenc_mvs[l];
- DECLARE_ALIGNED_4( int16_t mvc[4][2] );
+ ALIGNED_4( int16_t mvc[4][2] );
/* Reverse-order MV prediction. */
*(uint32_t*)mvc[0] = 0;
int i_icost, b_intra;
if( !fenc->b_intra_calculated )
{
- DECLARE_ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_ARRAY_16( uint8_t, edge,[33] );
uint8_t *pix = &pix1[8+FDEC_STRIDE - 1];
uint8_t *src = &fenc->lowres[0][i_pel_offset - 1];
const int intra_penalty = 5;
// GCC doesn't align stack variables on ARM, so use .bss
#ifdef ARCH_ARM
-#undef DECLARE_ALIGNED_16
-#define DECLARE_ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
+#undef ALIGNED_16
+#define ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
#endif
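A minimal sketch (assumption: GCC-style attributes; the wrapper is invented for the example) of why the .bss workaround is sufficient: the aligned(16) attribute is honored for objects with static storage duration even when the stack itself is only 8-byte aligned:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* static storage: placed in .bss, where aligned(16) is respected */
        static uint8_t edge[33] __attribute__((aligned(16)));
        printf("16-byte aligned: %d\n", (int)(((intptr_t)edge & 15) == 0)); /* 1 */
        return 0;
    }

The cost is that such buffers are no longer reentrant, which is presumably acceptable for the single-threaded checkasm tests, while the encoder proper uses the pointer-based ALIGNED_ARRAY_16 instead.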
/* buf1, buf2: initialised to random data and shouldn't write into them */
x264_predict_t predict_4x4[9+3];
x264_predict8x8_t predict_8x8[9+3];
x264_predict_8x8_filter_t predict_8x8_filter;
- DECLARE_ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_16( uint8_t edge[33] );
uint16_t cost_mv[32];
int ret = 0, ok, used_asm;
int i, j;
pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
{
float res_c, res_a;
- DECLARE_ALIGNED_16( int sums[5][4] ) = {{0}};
+ ALIGNED_16( int sums[5][4] ) = {{0}};
used_asm = ok = 1;
x264_emms();
res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
for( i=0; i<100 && ok; i++ )
if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
{
- DECLARE_ALIGNED_16( uint16_t sums[72] );
- DECLARE_ALIGNED_16( int dc[4] );
+ ALIGNED_16( uint16_t sums[72] );
+ ALIGNED_16( int dc[4] );
int16_t mvs_a[32], mvs_c[32];
int mvn_a, mvn_c;
int thresh = rand() & 0x3fff;
x264_dct_function_t dct_asm;
x264_quant_function_t qf;
int ret = 0, ok, used_asm, i, j, interlace;
- DECLARE_ALIGNED_16( int16_t dct1[16][4][4] );
- DECLARE_ALIGNED_16( int16_t dct2[16][4][4] );
- DECLARE_ALIGNED_16( int16_t dct4[16][4][4] );
- DECLARE_ALIGNED_16( int16_t dct8[4][8][8] );
- DECLARE_ALIGNED_8( int16_t dctdc[2][2][2] );
+ ALIGNED_16( int16_t dct1[16][4][4] );
+ ALIGNED_16( int16_t dct2[16][4][4] );
+ ALIGNED_16( int16_t dct4[16][4][4] );
+ ALIGNED_16( int16_t dct8[4][8][8] );
+ ALIGNED_8( int16_t dctdc[2][2][2] );
x264_t h_buf;
x264_t *h = &h_buf;
x264_zigzag_function_t zigzag_ref;
x264_zigzag_function_t zigzag_asm;
- DECLARE_ALIGNED_16( int16_t level1[64] );
- DECLARE_ALIGNED_16( int16_t level2[64] );
+ ALIGNED_16( int16_t level1[64] );
+ ALIGNED_16( int16_t level2[64] );
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \
x264_quant_function_t qf_c;
x264_quant_function_t qf_ref;
x264_quant_function_t qf_a;
- DECLARE_ALIGNED_16( int16_t dct1[64] );
- DECLARE_ALIGNED_16( int16_t dct2[64] );
- DECLARE_ALIGNED_16( uint8_t cqm_buf[64] );
+ ALIGNED_16( int16_t dct1[64] );
+ ALIGNED_16( int16_t dct2[64] );
+ ALIGNED_16( uint8_t cqm_buf[64] );
int ret = 0, ok, used_asm;
int oks[2] = {1,1}, used_asms[2] = {0,0};
int i, j, i_cqm, qp;
{
int ret = 0, ok = 1, used_asm = 0;
int i;
- DECLARE_ALIGNED_16( uint8_t edge[33] );
- DECLARE_ALIGNED_16( uint8_t edge2[33] );
+ ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_16( uint8_t edge2[33] );
struct
{
x264_predict_t predict_16x16[4+3];