This conversion to a unified pixel type is necessary for future high bit-depth support. Various macros and extra types have been introduced to make operations on variable-size pixels more convenient; a short usage sketch follows the definitions below.
#define CP64(dst,src) M64(dst) = M64(src)
#define CP128(dst,src) M128(dst) = M128(src)
+typedef uint8_t pixel;
+typedef uint32_t pixel4;
+
+#define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
+#define MPIXEL_X4(src) M32(src)
+#define CPPIXEL_X4(dst,src) CP32(dst,src)
+#define CPPIXEL_X8(dst,src) CP64(dst,src)
+
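As an illustration (a hypothetical helper, not part of the patch; it assumes x264's FDEC_STRIDE constant and the M32/CP32 access macros are in scope), the new names operate on four pixels at a time:

static void splat_left_example( pixel *dst )
{
    pixel4 v = PIXEL_SPLAT_X4( dst[-1] );   /* replicate the left neighbour into all four lanes */
    MPIXEL_X4( dst+0 ) = v;                 /* a single store writes four pixels */
    MPIXEL_X4( dst+4 ) = v;
    CPPIXEL_X4( dst+FDEC_STRIDE, dst );     /* copy the first four pixels to the next row */
}

With 8-bit pixels these expand to the existing 32-bit M32/CP32 operations; a wider pixel type then only needs new definitions of the typedefs and macros rather than changes to every caller.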
#define X264_SCAN8_SIZE (6*8)
#define X264_SCAN8_LUMA_SIZE (5*8)
#define X264_SCAN8_0 (4+1*8)
void x264_reduce_fraction( uint32_t *n, uint32_t *d );
void x264_init_vlc_tables();
-static ALWAYS_INLINE uint8_t x264_clip_uint8( int x )
+static ALWAYS_INLINE pixel x264_clip_pixel( int x )
{
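    /* branchless clip to [0,255]: any bit above the low 8 means x is out of range;
     * (-x)>>31 is 0 for x < 0 and all-ones (255 after truncation) for x > 255 */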
return x&(~255) ? (-x)>>31 : x;
}
* NOTE: this will fail on resolutions above 2^16 MBs... */
/* buffer for weighted versions of the reference frames */
- uint8_t *p_weight_buf[16];
+ pixel *p_weight_buf[16];
/* current value */
int i_type;
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
- ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] );
- ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] );
+ ALIGNED_16( pixel fenc_buf[24*FENC_STRIDE] );
+ ALIGNED_16( pixel fdec_buf[27*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
- ALIGNED_16( uint8_t i4x4_fdec_buf[16*16] );
- ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
+ ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
+ ALIGNED_16( pixel i8x8_fdec_buf[16*16] );
ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
uint32_t i4x4_nnz_buf[4];
ALIGNED_16( uint32_t fenc_satd_cache[32] );
/* pointer over mb of the frame to be compressed */
- uint8_t *p_fenc[3];
+ pixel *p_fenc[3];
/* pointer to the actual source frame, not a block copy */
- uint8_t *p_fenc_plane[3];
+ pixel *p_fenc_plane[3];
/* pointer over mb of the frame to be reconstructed */
- uint8_t *p_fdec[3];
+ pixel *p_fdec[3];
/* pointer over mb of the references */
int i_fref[2];
- uint8_t *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
- uint8_t *p_fref_w[32]; /* weighted fullpel luma */
+ pixel *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
+ pixel *p_fref_w[32]; /* weighted fullpel luma */
uint16_t *p_integral[2][16];
/* fref stride */
/* Buffers that are allocated per-thread even in sliced threads. */
void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
- uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
+ pixel *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
uint8_t (*deblock_strength[2])[2][4][4];
 /* CPU-dependent functions */
}
static inline void pixel_sub_wxh( int16_t *diff, int i_size,
- uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+ pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
for( int y = 0; y < i_size; y++ )
{
}
}
-static void sub4x4_dct( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
+static void sub4x4_dct( int16_t dct[16], pixel *pix1, pixel *pix2 )
{
int16_t d[16];
int16_t tmp[16];
}
}
-static void sub8x8_dct( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
+static void sub8x8_dct( int16_t dct[4][16], pixel *pix1, pixel *pix2 )
{
sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}
-static void sub16x16_dct( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
+static void sub16x16_dct( int16_t dct[16][16], pixel *pix1, pixel *pix2 )
{
sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
-static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
+static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
{
int16_t d[16];
int sum = 0;
return sum;
}
-static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
+static void sub8x8_dct_dc( int16_t dct[4], pixel *pix1, pixel *pix2 )
{
dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
dct[3] = d2 - d3;
}
-static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
+static void add4x4_idct( pixel *p_dst, int16_t dct[16] )
{
int16_t d[16];
int16_t tmp[16];
for( int y = 0; y < 4; y++ )
{
for( int x = 0; x < 4; x++ )
- p_dst[x] = x264_clip_uint8( p_dst[x] + d[y*4+x] );
+ p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
p_dst += FDEC_STRIDE;
}
}
-static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][16] )
+static void add8x8_idct( pixel *p_dst, int16_t dct[4][16] )
{
add4x4_idct( &p_dst[0], dct[0] );
add4x4_idct( &p_dst[4], dct[1] );
add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}
-static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][16] )
+static void add16x16_idct( pixel *p_dst, int16_t dct[16][16] )
{
add8x8_idct( &p_dst[0], &dct[0] );
add8x8_idct( &p_dst[8], &dct[4] );
    DST(7) = (a4>>2) - a7;\
}
-static void sub8x8_dct8( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
+static void sub8x8_dct8( int16_t dct[64], pixel *pix1, pixel *pix2 )
{
int16_t tmp[64];
#undef DST
}
-static void sub16x16_dct8( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
+static void sub16x16_dct8( int16_t dct[4][64], pixel *pix1, pixel *pix2 )
{
sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
DST(7, b0 - b7);\
}
-static void add8x8_idct8( uint8_t *dst, int16_t dct[64] )
+static void add8x8_idct8( pixel *dst, int16_t dct[64] )
{
dct[0] += 32; // rounding for the >>6 at the end
#undef DST
#define SRC(x) dct[i*8+x]
-#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
+#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
for( int i = 0; i < 8; i++ )
IDCT8_1D
#undef SRC
#undef DST
}
-static void add16x16_idct8( uint8_t *dst, int16_t dct[4][64] )
+static void add16x16_idct8( pixel *dst, int16_t dct[4][64] )
{
add8x8_idct8( &dst[0], dct[0] );
add8x8_idct8( &dst[8], dct[1] );
add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}
-static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
+static inline void add4x4_idct_dc( pixel *p_dst, int16_t dc )
{
dc = (dc + 32) >> 6;
for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
{
- p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
- p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
- p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
- p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
+ p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
+ p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
+ p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
+ p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
}
}
-static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[4] )
+static void add8x8_idct_dc( pixel *p_dst, int16_t dct[4] )
{
add4x4_idct_dc( &p_dst[0], dct[0] );
add4x4_idct_dc( &p_dst[4], dct[1] );
add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}
-static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[16] )
+static void add16x16_idct_dc( pixel *p_dst, int16_t dct[16] )
{
for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
{
nz |= level[i];\
}
#define COPY4x4\
- CP32( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
- CP32( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
- CP32( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
- CP32( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
+ CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
+ CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
+ CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
+ CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define COPY8x8\
- CP64( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
- CP64( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
- CP64( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
- CP64( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
- CP64( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
- CP64( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
- CP64( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
- CP64( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
+ CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
+ CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
+ CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
+ CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
+ CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
+ CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
+ CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
+ CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
-static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
+static int zigzag_sub_4x4_frame( int16_t level[16], const pixel *p_src, pixel *p_dst )
{
int nz = 0;
ZIGZAG4_FRAME
return !!nz;
}
-static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
+static int zigzag_sub_4x4_field( int16_t level[16], const pixel *p_src, pixel *p_dst )
{
int nz = 0;
ZIGZAG4_FIELD
level[0] = 0;\
}
-static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
+static int zigzag_sub_4x4ac_frame( int16_t level[16], const pixel *p_src, pixel *p_dst, int16_t *dc )
{
int nz = 0;
ZIGZAG4_FRAME
return !!nz;
}
-static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
+static int zigzag_sub_4x4ac_field( int16_t level[16], const pixel *p_src, pixel *p_dst, int16_t *dc )
{
int nz = 0;
ZIGZAG4_FIELD
return !!nz;
}
-static int zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
+static int zigzag_sub_8x8_frame( int16_t level[64], const pixel *p_src, pixel *p_dst )
{
int nz = 0;
ZIGZAG8_FRAME
COPY8x8
return !!nz;
}
-static int zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
+static int zigzag_sub_8x8_field( int16_t level[64], const pixel *p_src, pixel *p_dst )
{
int nz = 0;
ZIGZAG8_FIELD
// pix1 stride = FENC_STRIDE
// pix2 stride = FDEC_STRIDE
// p_dst stride = FDEC_STRIDE
- void (*sub4x4_dct) ( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
- void (*add4x4_idct) ( uint8_t *p_dst, int16_t dct[16] );
+ void (*sub4x4_dct) ( int16_t dct[16], pixel *pix1, pixel *pix2 );
+ void (*add4x4_idct) ( pixel *p_dst, int16_t dct[16] );
- void (*sub8x8_dct) ( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
- void (*sub8x8_dct_dc)( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
- void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][16] );
- void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[4] );
+ void (*sub8x8_dct) ( int16_t dct[4][16], pixel *pix1, pixel *pix2 );
+ void (*sub8x8_dct_dc)( int16_t dct[4], pixel *pix1, pixel *pix2 );
+ void (*add8x8_idct) ( pixel *p_dst, int16_t dct[4][16] );
+ void (*add8x8_idct_dc) ( pixel *p_dst, int16_t dct[4] );
- void (*sub16x16_dct) ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
- void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][16] );
- void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[16] );
+ void (*sub16x16_dct) ( int16_t dct[16][16], pixel *pix1, pixel *pix2 );
+ void (*add16x16_idct)( pixel *p_dst, int16_t dct[16][16] );
+ void (*add16x16_idct_dc) ( pixel *p_dst, int16_t dct[16] );
- void (*sub8x8_dct8) ( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
- void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[64] );
+ void (*sub8x8_dct8) ( int16_t dct[64], pixel *pix1, pixel *pix2 );
+ void (*add8x8_idct8) ( pixel *p_dst, int16_t dct[64] );
- void (*sub16x16_dct8) ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
- void (*add16x16_idct8)( uint8_t *p_dst, int16_t dct[4][64] );
+ void (*sub16x16_dct8) ( int16_t dct[4][64], pixel *pix1, pixel *pix2 );
+ void (*add16x16_idct8)( pixel *p_dst, int16_t dct[4][64] );
void (*dct4x4dc) ( int16_t d[16] );
void (*idct4x4dc)( int16_t d[16] );
{
void (*scan_8x8)( int16_t level[64], int16_t dct[64] );
void (*scan_4x4)( int16_t level[16], int16_t dct[16] );
- int (*sub_8x8) ( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
- int (*sub_4x4) ( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
- int (*sub_4x4ac)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc );
+ int (*sub_8x8) ( int16_t level[64], const pixel *p_src, pixel *p_dst );
+ int (*sub_4x4) ( int16_t level[16], const pixel *p_src, pixel *p_dst );
+ int (*sub_4x4ac)( int16_t level[16], const pixel *p_src, pixel *p_dst, int16_t *dc );
void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src, uint8_t *nnz );
} x264_zigzag_function_t;
#define tc0_table(x) i_tc0_table[(x)+12]
/* From ffmpeg */
-static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
+static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
for( int i = 0; i < 4; i++ )
{
}
delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
- pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
- pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
+ pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
+ pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
pix += ystride;
}
}
}
-static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_v_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
-static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_h_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
}
-static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
+static inline void deblock_chroma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
for( int i = 0; i < 4; i++ )
{
if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
{
int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
- pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
- pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
+ pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
+ pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
pix += ystride;
}
}
}
-static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
}
-static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
}
-static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
+static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
{
for( int d = 0; d < 16; d++ )
{
pix += ystride;
}
}
-static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
+static void deblock_v_luma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
-static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
+static void deblock_h_luma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
-static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
+static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
{
for( int d = 0; d < 8; d++ )
{
pix += ystride;
}
}
-static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
+static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
}
-static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
+static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
}
}
}
-static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
+static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
{
int index_a = i_qp + h->sh.i_alpha_c0_offset;
int alpha = alpha_table(index_a);
pf_inter( pix2, i_stride, alpha, beta, tc );
}
-static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
+static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
{
int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
int beta = beta_table(i_qp + h->sh.i_beta_offset);
int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
uint8_t (*bs)[4][4] = h->deblock_strength[mb_y&b_interlaced][mb_x];
- uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
- uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
- uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
+ pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
+ pixel *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
+ pixel *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
if( mb_y & b_interlaced )
{
pixy -= 15*stridey;
chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
for( int i = 1; i < 3; i++ )
{
- CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
+ CHECKED_MALLOC( frame->buffer[i], chroma_plane_size * sizeof(pixel) );
frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
}
* requires them to be in-phase wrt cacheline alignment. */
if( h->param.analyse.i_subpel_refine && b_fdec )
{
- CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
+ CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size * sizeof(pixel) );
for( int i = 0; i < 4; i++ )
frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
frame->plane[0] = frame->filtered[0];
}
else
{
- CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
+ CHECKED_MALLOC( frame->buffer[0], luma_plane_size * sizeof(pixel) );
frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
}
luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
- CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
+ CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
for( int i = 0; i < 4; i++ )
frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
return 0;
}
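+/* memset writes bytes; this variant writes whole pixels, so it keeps working when pixel grows beyond 8 bits */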
+static void ALWAYS_INLINE pixel_memset( pixel *dst, int value, int size )
+{
+ for( int i = 0; i < size; i++ )
+ dst[i] = value;
+}
-
-static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
+static void plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
{
#define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
for( int y = 0; y < i_height; y++ )
{
/* left band */
- memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
+ pixel_memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
/* right band */
- memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
+ pixel_memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
}
/* upper band */
if( b_pad_top )
for( int y = 0; y < i_padv; y++ )
- memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
+ memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), (i_width+2*i_padh) * sizeof(pixel) );
/* lower band */
if( b_pad_bottom )
for( int y = 0; y < i_padv; y++ )
- memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
+ memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), (i_width+2*i_padh) * sizeof(pixel) );
#undef PPIXEL
}
int padh = PADH >> !!i;
int padv = PADV >> !!i;
// buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
- uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
+ pixel *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
if( b_end && !b_start )
height += 4 >> (!!i + h->sh.b_mbaff);
if( h->sh.b_mbaff )
for( int i = 1; i < 4; i++ )
{
// buffer: 8 luma, to match the hpel filter
- uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
+ pixel *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
if( h->sh.b_mbaff )
{
plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
if( i_padx )
{
for( int y = 0; y < i_height; y++ )
- memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
- frame->plane[i][y*frame->i_stride[i] + i_width - 1],
- i_padx );
+ {
+ pixel value = frame->plane[i][y*frame->i_stride[i] + i_width - 1];
+ pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width], value, i_padx );
+ }
}
if( i_pady )
{
for( int y = i_height; y < i_height + i_pady; y++ )
memcpy( &frame->plane[i][y*frame->i_stride[i]],
&frame->plane[i][(i_height-(~y&h->param.b_interlaced)-1)*frame->i_stride[i]],
- i_width + i_padx );
+ (i_width + i_padx) * sizeof(pixel) );
}
}
}
} while( !b_ok );
}
-void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
+void x264_weight_scale_plane( x264_t *h, pixel *dst, int i_dst_stride, pixel *src, int i_src_stride,
int i_width, int i_height, x264_weight_t *w )
{
/* Weight horizontal strips of height 16. This was found to be the optimal height
int i_stride_lowres;
int i_width_lowres;
int i_lines_lowres;
- uint8_t *plane[3];
- uint8_t *filtered[4]; /* plane[0], H, V, HV */
- uint8_t *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
+ pixel *plane[3];
+ pixel *filtered[4]; /* plane[0], H, V, HV */
+ pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
uint16_t *integral;
/* for unrestricted mv we allocate more data than needed
* allocated data are stored in buffer */
- uint8_t *buffer[4];
- uint8_t *buffer_lowres[4];
+ pixel *buffer[4];
+ pixel *buffer_lowres[4];
x264_weight_t weight[16][3]; /* [ref_index][plane] */
- uint8_t *weighted[16]; /* plane[0] weighted of the reference frames */
+ pixel *weighted[16]; /* plane[0] weighted of the reference frames */
int b_duplicate;
struct x264_frame *orig;
x264_pthread_cond_t cv_empty; /* event signaling that the list became emptier */
} x264_synch_frame_list_t;
-typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
-typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta );
+typedef void (*x264_deblock_inter_t)( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+typedef void (*x264_deblock_intra_t)( pixel *pix, int stride, int alpha, int beta );
typedef struct
{
x264_deblock_inter_t deblock_luma[2];
void x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
-void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
+void x264_weight_scale_plane( x264_t *h, pixel *dst, int i_dst_stride, pixel *src, int i_src_stride,
int i_width, int i_height, x264_weight_t *w );
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
void x264_frame_sort( x264_frame_t **list, int b_dts );
int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
int i_mode = x264_size2pixel[height][width];
int i_stride0 = 16, i_stride1 = 16;
- ALIGNED_ARRAY_16( uint8_t, tmp0,[16*16] );
- ALIGNED_ARRAY_16( uint8_t, tmp1,[16*16] );
- uint8_t *src0, *src1;
+ ALIGNED_ARRAY_16( pixel, tmp0,[16*16] );
+ ALIGNED_ARRAY_16( pixel, tmp1,[16*16] );
+ pixel *src0, *src1;
src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
mvx0, mvy0, 4*width, 4*height, weight_none );
}
for( int i = 0; i < numweightbuf; i++ )
- CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size );
+ CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size * sizeof(pixel) );
#undef ALIGN
}
for( int j = 0; j < 3; j++ )
{
/* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
- CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
+ CHECKED_MALLOCZERO( h->intra_border_backup[i][j], ((h->sps->i_mb_width*16+32)>>!!j) * sizeof(pixel) );
h->intra_border_backup[i][j] += 8;
}
CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->sps->i_mb_width );
fenc->plane[1+(i_mb_x&1)]+off_uv, stride_uv, i_mb_x );
}
-static NOINLINE void copy_column8( uint8_t *dst, uint8_t *src )
+static NOINLINE void copy_column8( pixel *dst, pixel *src )
{
// input pointers are offset by 4 rows because that's faster (smaller instruction size on x86)
for( int i = -4; i < 4; i++ )
const int i_pix_offset = h->mb.b_interlaced
? w * (mb_x + (mb_y&~1) * i_stride) + (mb_y&1) * i_stride
: w * (mb_x + mb_y * i_stride);
- const uint8_t *plane_fdec = &h->fdec->plane[i][i_pix_offset];
- const uint8_t *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
+ const pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+ const pixel *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
x264_frame_t **fref[2] = { h->fref0, h->fref1 };
if( h->mb.b_interlaced )
h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
h->mb.pic.p_fenc_plane[i], i_stride2, w );
- memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, w*3/2+1 );
+ memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, (w*3/2+1) * sizeof(pixel) );
if( h->mb.b_interlaced )
for( int j = 0; j < w; j++ )
h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
#endif
}
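+/* pixel-size-generic names for the packing helpers: with 8-bit pixels, 1to2 packs
+ * two pixels into 16 bits and 2to4 packs two such 16-bit values into 32 */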
+#define pack_pixel_1to2 pack8to16
+#define pack_pixel_2to4 pack16to32
+
#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
#define array_non_zero_int array_non_zero_int
static ALWAYS_INLINE int array_non_zero_int( int16_t *v, int i_count )
#endif
-static inline void pixel_avg( uint8_t *dst, int i_dst_stride,
- uint8_t *src1, int i_src1_stride,
- uint8_t *src2, int i_src2_stride,
+static inline void pixel_avg( pixel *dst, int i_dst_stride,
+ pixel *src1, int i_src1_stride,
+ pixel *src2, int i_src2_stride,
int i_width, int i_height )
{
for( int y = 0; y < i_height; y++ )
}
}
-static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int width, int height )
+static inline void pixel_avg_wxh( pixel *dst, int i_dst, pixel *src1, int i_src1, pixel *src2, int i_src2, int width, int height )
{
for( int y = 0; y < height; y++ )
{
/* Implicit weighted bipred only:
* assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
-#define op_scale2(x) dst[x] = x264_clip_uint8( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 )
-static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int width, int height, int i_weight1 )
+#define op_scale2(x) dst[x] = x264_clip_pixel( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 )
+static inline void pixel_avg_weight_wxh( pixel *dst, int i_dst, pixel *src1, int i_src1, pixel *src2, int i_src2, int width, int height, int i_weight1 )
{
const int i_weight2 = 64 - i_weight1;
for( int y = 0; y<height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
#undef op_scale2
#define PIXEL_AVG_C( name, width, height ) \
-static void name( uint8_t *pix1, int i_stride_pix1, \
- uint8_t *pix2, int i_stride_pix2, \
- uint8_t *pix3, int i_stride_pix3, int weight ) \
+static void name( pixel *pix1, int i_stride_pix1, \
+ pixel *pix2, int i_stride_pix2, \
+ pixel *pix3, int i_stride_pix3, int weight ) \
{ \
if( weight == 32 ) \
pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
{
w->weightfn = h->mc.weight;
}
-#define opscale(x) dst[x] = x264_clip_uint8( ((src[x] * weight->i_scale + (1<<(weight->i_denom - 1))) >> weight->i_denom) + weight->i_offset )
-#define opscale_noden(x) dst[x] = x264_clip_uint8( src[x] * weight->i_scale + weight->i_offset )
-static inline void mc_weight( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
+#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * weight->i_scale + (1<<(weight->i_denom - 1))) >> weight->i_denom) + weight->i_offset )
+#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * weight->i_scale + weight->i_offset )
+static inline void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
{
if( weight->i_denom >= 1 )
{
}
#define MC_WEIGHT_C( name, lx ) \
- static void name( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, const x264_weight_t *weight, int height ) \
+ static void name( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int height ) \
{ \
if( weight->i_denom >= 1 ) \
{ \
mc_weight_w20,
};
const x264_weight_t weight_none[3] = { {{0}} };
-static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
+static void mc_copy( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, int i_width, int i_height )
{
for( int y = 0; y < i_height; y++ )
{
- memcpy( dst, src, i_width );
+ memcpy( dst, src, i_width * sizeof(pixel) );
src += i_src_stride;
dst += i_dst_stride;
}
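/* 6-tap (1,-5,20,20,-5,1) half-pel interpolation filter; d selects the tap direction (1 = horizontal, stride = vertical) */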
#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
-static void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
int stride, int width, int height, int16_t *buf )
{
for( int y = 0; y < height; y++ )
for( int x = -2; x < width+3; x++ )
{
int v = TAPFILTER(src,stride);
- dstv[x] = x264_clip_uint8( (v + 16) >> 5 );
+ dstv[x] = x264_clip_pixel( (v + 16) >> 5 );
buf[x+2] = v;
}
for( int x = 0; x < width; x++ )
- dstc[x] = x264_clip_uint8( (TAPFILTER(buf+2,1) + 512) >> 10 );
+ dstc[x] = x264_clip_pixel( (TAPFILTER(buf+2,1) + 512) >> 10 );
for( int x = 0; x < width; x++ )
- dsth[x] = x264_clip_uint8( (TAPFILTER(src,1) + 16) >> 5 );
+ dsth[x] = x264_clip_pixel( (TAPFILTER(src,1) + 16) >> 5 );
dsth += stride;
dstv += stride;
dstc += stride;
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
-static void mc_luma( uint8_t *dst, int i_dst_stride,
- uint8_t *src[4], int i_src_stride,
+static void mc_luma( pixel *dst, int i_dst_stride,
+ pixel *src[4], int i_src_stride,
int mvx, int mvy,
int i_width, int i_height, const x264_weight_t *weight )
{
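    /* the low two bits of each mv component select the quarter-pel phase; the full-pel part becomes a plain offset */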
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
- uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
+ pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
if( qpel_idx & 5 ) /* qpel interpolation needed */
{
- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+ pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
pixel_avg( dst, i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_width, i_height );
if( weight->weightfn )
mc_copy( src1, i_src_stride, dst, i_dst_stride, i_width, i_height );
}
-static uint8_t *get_ref( uint8_t *dst, int *i_dst_stride,
- uint8_t *src[4], int i_src_stride,
- int mvx, int mvy,
- int i_width, int i_height, const x264_weight_t *weight )
+static pixel *get_ref( pixel *dst, int *i_dst_stride,
+ pixel *src[4], int i_src_stride,
+ int mvx, int mvy,
+ int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
- uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
+ pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
if( qpel_idx & 5 ) /* qpel interpolation needed */
{
- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+ pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
pixel_avg( dst, *i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_width, i_height );
if( weight->weightfn )
}
/* full chroma mc (i.e. down to 1/8-pel precision) */
-static void mc_chroma( uint8_t *dst, int i_dst_stride,
- uint8_t *src, int i_src_stride,
+static void mc_chroma( pixel *dst, int i_dst_stride,
+ pixel *src, int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
- uint8_t *srcp;
+ pixel *srcp;
int d8x = mvx&0x07;
int d8y = mvy&0x07;
}
#define MC_COPY(W) \
-static void mc_copy_w##W( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_height ) \
+static void mc_copy_w##W( pixel *dst, int i_dst, pixel *src, int i_src, int i_height ) \
{ \
mc_copy( src, i_src, dst, i_dst, W, i_height ); \
}
MC_COPY( 8 )
MC_COPY( 4 )
-void x264_plane_copy_c( uint8_t *dst, int i_dst,
+void x264_plane_copy_c( pixel *dst, int i_dst,
uint8_t *src, int i_src, int w, int h)
{
while( h-- )
}
}
-static void prefetch_fenc_null( uint8_t *pix_y, int stride_y,
- uint8_t *pix_uv, int stride_uv, int mb_x )
+static void prefetch_fenc_null( pixel *pix_y, int stride_y,
+ pixel *pix_uv, int stride_uv, int mb_x )
{}
-static void prefetch_ref_null( uint8_t *pix, int stride, int parity )
+static void prefetch_ref_null( pixel *pix, int stride, int parity )
{}
static void memzero_aligned( void * dst, int n )
memset( dst, 0, n );
}
-static void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
+static void integral_init4h( uint16_t *sum, pixel *pix, int stride )
{
int v = pix[0]+pix[1]+pix[2]+pix[3];
for( int x = 0; x < stride-4; x++ )
}
}
-static void integral_init8h( uint16_t *sum, uint8_t *pix, int stride )
+static void integral_init8h( uint16_t *sum, pixel *pix, int stride )
{
int v = pix[0]+pix[1]+pix[2]+pix[3]+pix[4]+pix[5]+pix[6]+pix[7];
for( int x = 0; x < stride-8; x++ )
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
{
- uint8_t *src = frame->plane[0];
+ pixel *src = frame->plane[0];
int i_stride = frame->i_stride[0];
int i_height = frame->i_lines[0];
int i_width = frame->i_width[0];
// duplicate last row and column so that their interpolation doesn't have to be special-cased
for( int y = 0; y < i_height; y++ )
src[i_width+y*i_stride] = src[i_width-1+y*i_stride];
- memcpy( src+i_stride*i_height, src+i_stride*(i_height-1), i_width+1 );
+ memcpy( src+i_stride*i_height, src+i_stride*(i_height-1), (i_width+1) * sizeof(pixel) );
h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3],
i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
x264_frame_expand_border_lowres( frame );
frame->lowres_mvs[y][x][0][0] = 0x7FFF;
}
-static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
int src_stride, int dst_stride, int width, int height )
{
for( int y = 0; y < height; y++ )
{
- uint8_t *src1 = src0+src_stride;
- uint8_t *src2 = src1+src_stride;
+ pixel *src1 = src0+src_stride;
+ pixel *src2 = src1+src_stride;
for( int x = 0; x<width; x++ )
{
// slower than naive bilinear, but matches asm
height += PADV-9;
for( int y = start; y < height; y++ )
{
- uint8_t *pix = frame->plane[0] + y * stride - PADH;
+ pixel *pix = frame->plane[0] + y * stride - PADH;
uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
uint16_t *sum4;
if( h->frames.b_have_sub8x8_esa )
#define X264_MC_H
struct x264_weight_t;
-typedef void (* weight_fn_t)( uint8_t *, int, uint8_t *,int, const struct x264_weight_t *, int );
+typedef void (* weight_fn_t)( pixel *, int, pixel *,int, const struct x264_weight_t *, int );
typedef struct x264_weight_t
{
/* aligning the first member is a gcc hack to force the struct to be
typedef struct
{
- void (*mc_luma)(uint8_t *dst, int i_dst, uint8_t **src, int i_src,
+ void (*mc_luma)(pixel *dst, int i_dst, pixel **src, int i_src,
int mvx, int mvy,
int i_width, int i_height, const x264_weight_t *weight );
/* may round up the dimensions if they're not a power of 2 */
- uint8_t* (*get_ref)(uint8_t *dst, int *i_dst, uint8_t **src, int i_src,
- int mvx, int mvy,
- int i_width, int i_height, const x264_weight_t *weight );
+ pixel* (*get_ref)(pixel *dst, int *i_dst, pixel **src, int i_src,
+ int mvx, int mvy,
+ int i_width, int i_height, const x264_weight_t *weight );
/* mc_chroma may write up to 2 bytes of garbage to the right of dst,
* so it must be run from left to right. */
- void (*mc_chroma)(uint8_t *dst, int i_dst, uint8_t *src, int i_src,
+ void (*mc_chroma)(pixel *dst, int i_dst, pixel *src, int i_src,
int mvx, int mvy,
int i_width, int i_height );
- void (*avg[10])( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight );
+ void (*avg[10])( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight );
/* only 16x16, 8x8, and 4x4 defined */
- void (*copy[7])( uint8_t *dst, int, uint8_t *src, int, int i_height );
- void (*copy_16x16_unaligned)( uint8_t *dst, int, uint8_t *src, int, int i_height );
+ void (*copy[7])( pixel *dst, int, pixel *src, int, int i_height );
+ void (*copy_16x16_unaligned)( pixel *dst, int, pixel *src, int, int i_height );
- void (*plane_copy)( uint8_t *dst, int i_dst,
+ void (*plane_copy)( pixel *dst, int i_dst,
uint8_t *src, int i_src, int w, int h);
- void (*hpel_filter)( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+ void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
int i_stride, int i_width, int i_height, int16_t *buf );
/* prefetch the next few macroblocks of fenc or fdec */
- void (*prefetch_fenc)( uint8_t *pix_y, int stride_y,
- uint8_t *pix_uv, int stride_uv, int mb_x );
+ void (*prefetch_fenc)( pixel *pix_y, int stride_y,
+ pixel *pix_uv, int stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
- void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
+ void (*prefetch_ref)( pixel *pix, int stride, int parity );
void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
void (*memzero_aligned)( void *dst, int n );
/* successive elimination prefilter */
- void (*integral_init4h)( uint16_t *sum, uint8_t *pix, int stride );
- void (*integral_init8h)( uint16_t *sum, uint8_t *pix, int stride );
+ void (*integral_init4h)( uint16_t *sum, pixel *pix, int stride );
+ void (*integral_init8h)( uint16_t *sum, pixel *pix, int stride );
void (*integral_init4v)( uint16_t *sum8, uint16_t *sum4, int stride );
void (*integral_init8v)( uint16_t *sum8, int stride );
- void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+ void (*frame_init_lowres_core)( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
int src_stride, int dst_stride, int width, int height );
weight_fn_t *weight;
weight_fn_t *offsetadd;
* pixel_sad_WxH
****************************************************************************/
#define PIXEL_SAD_C( name, lx, ly ) \
-static int name( uint8_t *pix1, int i_stride_pix1, \
- uint8_t *pix2, int i_stride_pix2 ) \
+static int name( pixel *pix1, int i_stride_pix1, \
+ pixel *pix2, int i_stride_pix2 ) \
{ \
int i_sum = 0; \
for( int y = 0; y < ly; y++ ) \
* pixel_ssd_WxH
****************************************************************************/
#define PIXEL_SSD_C( name, lx, ly ) \
-static int name( uint8_t *pix1, int i_stride_pix1, \
- uint8_t *pix2, int i_stride_pix2 ) \
+static int name( pixel *pix1, int i_stride_pix1, \
+ pixel *pix2, int i_stride_pix2 ) \
{ \
int i_sum = 0; \
for( int y = 0; y < ly; y++ ) \
PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 )
PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 )
-int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
+int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height )
{
int64_t i_ssd = 0;
int y;
* pixel_var_wxh
****************************************************************************/
#define PIXEL_VAR_C( name, w ) \
-static uint64_t name( uint8_t *pix, int i_stride ) \
+static uint64_t name( pixel *pix, int i_stride ) \
{ \
uint32_t sum = 0, sqr = 0; \
for( int y = 0; y < w; y++ ) \
/****************************************************************************
* pixel_var2_wxh
****************************************************************************/
-static int pixel_var2_8x8( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2, int *ssd )
+static int pixel_var2_8x8( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd )
{
uint32_t var = 0, sum = 0, sqr = 0;
for( int y = 0; y < 8; y++ )
* pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
****************************************************************************/
-static NOINLINE int x264_pixel_satd_4x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
uint32_t tmp[4][2];
uint32_t a0, a1, a2, a3, b0, b1;
return sum >> 1;
}
-static NOINLINE int x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
uint32_t tmp[4][4];
uint32_t a0, a1, a2, a3;
}
#define PIXEL_SATD_C( w, h, sub )\
-static int x264_pixel_satd_##w##x##h( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )\
+static int x264_pixel_satd_##w##x##h( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )\
{\
int sum = sub( pix1, i_pix1, pix2, i_pix2 )\
+ sub( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 );\
PIXEL_SATD_C( 4, 8, x264_pixel_satd_4x4 )
-static NOINLINE int sa8d_8x8( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static NOINLINE int sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
uint32_t tmp[8][4];
uint32_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
return sum;
}
-static int x264_pixel_sa8d_8x8( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static int x264_pixel_sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 );
return (sum+2)>>2;
}
-static int x264_pixel_sa8d_16x16( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static int x264_pixel_sa8d_16x16( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 )
+ sa8d_8x8( pix1+8, i_pix1, pix2+8, i_pix2 )
}
-static NOINLINE uint64_t pixel_hadamard_ac( uint8_t *pix, int stride )
+static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, int stride )
{
uint32_t tmp[32];
uint32_t a0, a1, a2, a3, dc;
}
#define HADAMARD_AC(w,h) \
-static uint64_t x264_pixel_hadamard_ac_##w##x##h( uint8_t *pix, int stride )\
+static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, int stride )\
{\
uint64_t sum = pixel_hadamard_ac( pix, stride );\
if( w==16 )\
* pixel_sad_x4
****************************************************************************/
#define SAD_X( size ) \
-static void x264_pixel_sad_x3_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
+static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, int i_stride, int scores[3] )\
{\
scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
}\
-static void x264_pixel_sad_x4_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
+static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, int i_stride, int scores[4] )\
{\
scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
****************************************************************************/
#define SATD_X( size, cpu ) \
-static void x264_pixel_satd_x3_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
+static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, int i_stride, int scores[3] )\
{\
scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
}\
-static void x264_pixel_satd_x4_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
+static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, int i_stride, int scores[4] )\
{\
scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
/****************************************************************************
* structural similarity metric
****************************************************************************/
-static void ssim_4x4x2_core( const uint8_t *pix1, int stride1,
- const uint8_t *pix2, int stride2,
+static void ssim_4x4x2_core( const pixel *pix1, int stride1,
+ const pixel *pix2, int stride2,
int sums[2][4])
{
for( int z = 0; z < 2; z++ )
}
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
- uint8_t *pix1, int stride1,
- uint8_t *pix2, int stride2,
+ pixel *pix1, int stride1,
+ pixel *pix2, int stride2,
int width, int height, void *buf )
{
int z = 0;
// SSD assumes all args aligned
// other cmp functions assume first arg aligned
-typedef int (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int );
-typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] );
-typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] );
+typedef int (*x264_pixel_cmp_t) ( pixel *, int, pixel *, int );
+typedef void (*x264_pixel_cmp_x3_t) ( pixel *, pixel *, pixel *, pixel *, int, int[3] );
+typedef void (*x264_pixel_cmp_x4_t) ( pixel *, pixel *, pixel *, pixel *, pixel *, int, int[4] );
enum
{
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
- int (*var2_8x8)( uint8_t *, int, uint8_t *, int, int * );
+ int (*var2_8x8)( pixel *, int, pixel *, int, int * );
- uint64_t (*var[4])( uint8_t *pix, int stride );
- uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
+ uint64_t (*var[4])( pixel *pix, int stride );
+ uint64_t (*hadamard_ac[4])( pixel *pix, int stride );
- void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
- const uint8_t *pix2, int stride2, int sums[2][4] );
+ void (*ssim_4x4x2_core)( const pixel *pix1, int stride1,
+ const pixel *pix2, int stride2, int sums[2][4] );
float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width );
/* multiple parallel calls to cmp. */
/* calculate satd or sad of V, H, and DC modes.
* may be NULL, in which case just use pred+satd instead. */
- void (*intra_mbcmp_x3_16x16)( uint8_t *fenc, uint8_t *fdec , int res[3] );
- void (*intra_satd_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
- void (*intra_sad_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
- void (*intra_mbcmp_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
- void (*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
- void (*intra_sad_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
- void (*intra_mbcmp_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
- void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
- void (*intra_sad_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
- void (*intra_mbcmp_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] );
- void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] );
- void (*intra_sad_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] );
+ void (*intra_mbcmp_x3_16x16)( pixel *fenc, pixel *fdec , int res[3] );
+ void (*intra_satd_x3_16x16) ( pixel *fenc, pixel *fdec , int res[3] );
+ void (*intra_sad_x3_16x16) ( pixel *fenc, pixel *fdec , int res[3] );
+ void (*intra_mbcmp_x3_8x8c) ( pixel *fenc, pixel *fdec , int res[3] );
+ void (*intra_satd_x3_8x8c) ( pixel *fenc, pixel *fdec , int res[3] );
+ void (*intra_sad_x3_8x8c) ( pixel *fenc, pixel *fdec , int res[3] );
+ void (*intra_mbcmp_x3_4x4) ( pixel *fenc, pixel *fdec , int res[3] );
+ void (*intra_satd_x3_4x4) ( pixel *fenc, pixel *fdec , int res[3] );
+ void (*intra_sad_x3_4x4) ( pixel *fenc, pixel *fdec , int res[3] );
+ void (*intra_mbcmp_x3_8x8) ( pixel *fenc, pixel edge[33], int res[3] );
+ void (*intra_sa8d_x3_8x8) ( pixel *fenc, pixel edge[33], int res[3] );
+ void (*intra_sad_x3_8x8) ( pixel *fenc, pixel edge[33], int res[3] );
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
-int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height );
-float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height, void *buf );
+int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height );
+float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, void *buf );
#endif
#define PREDICT_16x16_DC(v)\
for( int i = 0; i < 16; i++ )\
{\
- M32( src+ 0 ) = v;\
- M32( src+ 4 ) = v;\
- M32( src+ 8 ) = v;\
- M32( src+12 ) = v;\
+ MPIXEL_X4( src+ 0 ) = v;\
+ MPIXEL_X4( src+ 4 ) = v;\
+ MPIXEL_X4( src+ 8 ) = v;\
+ MPIXEL_X4( src+12 ) = v;\
src += FDEC_STRIDE;\
}
-static void predict_16x16_dc( uint8_t *src )
+static void predict_16x16_dc( pixel *src )
{
- uint32_t dc = 0;
+ pixel4 dc = 0;
for( int i = 0; i < 16; i++ )
{
dc += src[-1 + i * FDEC_STRIDE];
dc += src[i - FDEC_STRIDE];
}
- dc = (( dc + 16 ) >> 5) * 0x01010101;
+ dc = PIXEL_SPLAT_X4( ( dc + 16 ) >> 5 );
- PREDICT_16x16_DC(dc);
+ PREDICT_16x16_DC( dc );
}
-static void predict_16x16_dc_left( uint8_t *src )
+static void predict_16x16_dc_left( pixel *src )
{
- uint32_t dc = 0;
+ pixel4 dc = 0;
for( int i = 0; i < 16; i++ )
dc += src[-1 + i * FDEC_STRIDE];
- dc = (( dc + 8 ) >> 4) * 0x01010101;
+ dc = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );
- PREDICT_16x16_DC(dc);
+ PREDICT_16x16_DC( dc );
}
-static void predict_16x16_dc_top( uint8_t *src )
+static void predict_16x16_dc_top( pixel *src )
{
- uint32_t dc = 0;
+ pixel4 dc = 0;
for( int i = 0; i < 16; i++ )
dc += src[i - FDEC_STRIDE];
- dc = (( dc + 8 ) >> 4) * 0x01010101;
+ dc = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );
- PREDICT_16x16_DC(dc);
+ PREDICT_16x16_DC( dc );
}
-static void predict_16x16_dc_128( uint8_t *src )
+static void predict_16x16_dc_128( pixel *src )
{
- PREDICT_16x16_DC(0x80808080);
+ PREDICT_16x16_DC( PIXEL_SPLAT_X4( 0x80 ) );
}
-static void predict_16x16_h( uint8_t *src )
+static void predict_16x16_h( pixel *src )
{
for( int i = 0; i < 16; i++ )
{
- const uint32_t v = 0x01010101 * src[-1];
- M32( src+ 0 ) = v;
- M32( src+ 4 ) = v;
- M32( src+ 8 ) = v;
- M32( src+12 ) = v;
+ const pixel4 v = PIXEL_SPLAT_X4( src[-1] );
+ MPIXEL_X4( src+ 0 ) = v;
+ MPIXEL_X4( src+ 4 ) = v;
+ MPIXEL_X4( src+ 8 ) = v;
+ MPIXEL_X4( src+12 ) = v;
src += FDEC_STRIDE;
}
}
-static void predict_16x16_v( uint8_t *src )
+static void predict_16x16_v( pixel *src )
{
- uint32_t v0 = M32( &src[ 0-FDEC_STRIDE] );
- uint32_t v1 = M32( &src[ 4-FDEC_STRIDE] );
- uint32_t v2 = M32( &src[ 8-FDEC_STRIDE] );
- uint32_t v3 = M32( &src[12-FDEC_STRIDE] );
+ pixel4 v0 = MPIXEL_X4( &src[ 0-FDEC_STRIDE] );
+ pixel4 v1 = MPIXEL_X4( &src[ 4-FDEC_STRIDE] );
+ pixel4 v2 = MPIXEL_X4( &src[ 8-FDEC_STRIDE] );
+ pixel4 v3 = MPIXEL_X4( &src[12-FDEC_STRIDE] );
for( int i = 0; i < 16; i++ )
{
- M32( src+ 0 ) = v0;
- M32( src+ 4 ) = v1;
- M32( src+ 8 ) = v2;
- M32( src+12 ) = v3;
+ MPIXEL_X4( src+ 0 ) = v0;
+ MPIXEL_X4( src+ 4 ) = v1;
+ MPIXEL_X4( src+ 8 ) = v2;
+ MPIXEL_X4( src+12 ) = v3;
src += FDEC_STRIDE;
}
}
-static void predict_16x16_p( uint8_t *src )
+static void predict_16x16_p( pixel *src )
{
int H = 0, V = 0;
int pix = i00;
for( int x = 0; x < 16; x++ )
{
- src[x] = x264_clip_uint8( pix>>5 );
+ src[x] = x264_clip_pixel( pix>>5 );
pix += b;
}
src += FDEC_STRIDE;
* 8x8 prediction for intra chroma block
****************************************************************************/
-static void predict_8x8c_dc_128( uint8_t *src )
+static void predict_8x8c_dc_128( pixel *src )
{
for( int y = 0; y < 8; y++ )
{
- M32( src+0 ) = 0x80808080;
- M32( src+4 ) = 0x80808080;
+ MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 0x80 );
+ MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 0x80 );
src += FDEC_STRIDE;
}
}
-static void predict_8x8c_dc_left( uint8_t *src )
+static void predict_8x8c_dc_left( pixel *src )
{
- uint32_t dc0 = 0, dc1 = 0;
+ pixel4 dc0 = 0, dc1 = 0;
for( int y = 0; y < 4; y++ )
{
dc0 += src[y * FDEC_STRIDE - 1];
dc1 += src[(y+4) * FDEC_STRIDE - 1];
}
- dc0 = (( dc0 + 2 ) >> 2)*0x01010101;
- dc1 = (( dc1 + 2 ) >> 2)*0x01010101;
+ dc0 = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
+ dc1 = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
for( int y = 0; y < 4; y++ )
{
- M32( src+0 ) = dc0;
- M32( src+4 ) = dc0;
+ MPIXEL_X4( src+0 ) = dc0;
+ MPIXEL_X4( src+4 ) = dc0;
src += FDEC_STRIDE;
}
for( int y = 0; y < 4; y++ )
{
- M32( src+0 ) = dc1;
- M32( src+4 ) = dc1;
+ MPIXEL_X4( src+0 ) = dc1;
+ MPIXEL_X4( src+4 ) = dc1;
src += FDEC_STRIDE;
}
}
-static void predict_8x8c_dc_top( uint8_t *src )
+static void predict_8x8c_dc_top( pixel *src )
{
- uint32_t dc0 = 0, dc1 = 0;
+ pixel4 dc0 = 0, dc1 = 0;
for( int x = 0; x < 4; x++ )
{
dc0 += src[x - FDEC_STRIDE];
dc1 += src[x + 4 - FDEC_STRIDE];
}
- dc0 = (( dc0 + 2 ) >> 2)*0x01010101;
- dc1 = (( dc1 + 2 ) >> 2)*0x01010101;
+ dc0 = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
+ dc1 = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
for( int y = 0; y < 8; y++ )
{
- M32( src+0 ) = dc0;
- M32( src+4 ) = dc1;
+ MPIXEL_X4( src+0 ) = dc0;
+ MPIXEL_X4( src+4 ) = dc1;
src += FDEC_STRIDE;
}
}
-static void predict_8x8c_dc( uint8_t *src )
+static void predict_8x8c_dc( pixel *src )
{
int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
dc0 dc1
dc2 dc3
*/
- uint32_t dc0 = (( s0 + s2 + 4 ) >> 3)*0x01010101;
- uint32_t dc1 = (( s1 + 2 ) >> 2)*0x01010101;
- uint32_t dc2 = (( s3 + 2 ) >> 2)*0x01010101;
- uint32_t dc3 = (( s1 + s3 + 4 ) >> 3)*0x01010101;
+ pixel4 dc0 = PIXEL_SPLAT_X4( ( s0 + s2 + 4 ) >> 3 );
+ pixel4 dc1 = PIXEL_SPLAT_X4( ( s1 + 2 ) >> 2 );
+ pixel4 dc2 = PIXEL_SPLAT_X4( ( s3 + 2 ) >> 2 );
+ pixel4 dc3 = PIXEL_SPLAT_X4( ( s1 + s3 + 4 ) >> 3 );
for( int y = 0; y < 4; y++ )
{
- M32( src+0 ) = dc0;
- M32( src+4 ) = dc1;
+ MPIXEL_X4( src+0 ) = dc0;
+ MPIXEL_X4( src+4 ) = dc1;
src += FDEC_STRIDE;
}
for( int y = 0; y < 4; y++ )
{
- M32( src+0 ) = dc2;
- M32( src+4 ) = dc3;
+ MPIXEL_X4( src+0 ) = dc2;
+ MPIXEL_X4( src+4 ) = dc3;
src += FDEC_STRIDE;
}
}
-static void predict_8x8c_h( uint8_t *src )
+static void predict_8x8c_h( pixel *src )
{
for( int i = 0; i < 8; i++ )
{
- uint32_t v = 0x01010101 * src[-1];
- M32( src+0 ) = v;
- M32( src+4 ) = v;
+ pixel4 v = PIXEL_SPLAT_X4( src[-1] );
+ MPIXEL_X4( src+0 ) = v;
+ MPIXEL_X4( src+4 ) = v;
src += FDEC_STRIDE;
}
}
-static void predict_8x8c_v( uint8_t *src )
+static void predict_8x8c_v( pixel *src )
{
- uint32_t v0 = M32( src+0-FDEC_STRIDE );
- uint32_t v1 = M32( src+4-FDEC_STRIDE );
+ pixel4 v0 = MPIXEL_X4( src+0-FDEC_STRIDE );
+ pixel4 v1 = MPIXEL_X4( src+4-FDEC_STRIDE );
for( int i = 0; i < 8; i++ )
{
- M32( src+0 ) = v0;
- M32( src+4 ) = v1;
+ MPIXEL_X4( src+0 ) = v0;
+ MPIXEL_X4( src+4 ) = v1;
src += FDEC_STRIDE;
}
}
-static void predict_8x8c_p( uint8_t *src )
+static void predict_8x8c_p( pixel *src )
{
int H = 0, V = 0;
int pix = i00;
for( int x = 0; x < 8; x++ )
{
- src[x] = x264_clip_uint8( pix>>5 );
+ src[x] = x264_clip_pixel( pix>>5 );
pix += b;
}
src += FDEC_STRIDE;
****************************************************************************/
#define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
-#define SRC32(x,y) M32( &SRC(x,y) )
+#define SRC_X4(x,y) MPIXEL_X4( &SRC(x,y) )
#define PREDICT_4x4_DC(v)\
- SRC32(0,0) = SRC32(0,1) = SRC32(0,2) = SRC32(0,3) = v;
+ SRC_X4(0,0) = SRC_X4(0,1) = SRC_X4(0,2) = SRC_X4(0,3) = v;
-static void predict_4x4_dc_128( uint8_t *src )
+static void predict_4x4_dc_128( pixel *src )
{
- PREDICT_4x4_DC(0x80808080);
+ PREDICT_4x4_DC( PIXEL_SPLAT_X4( 0x80 ) );
}
-static void predict_4x4_dc_left( uint8_t *src )
+static void predict_4x4_dc_left( pixel *src )
{
- uint32_t dc = ((SRC(-1,0) + SRC(-1,1) + SRC(-1,2) + SRC(-1,3) + 2) >> 2) * 0x01010101;
- PREDICT_4x4_DC(dc);
+ pixel4 dc = PIXEL_SPLAT_X4( (SRC(-1,0) + SRC(-1,1) + SRC(-1,2) + SRC(-1,3) + 2) >> 2 );
+ PREDICT_4x4_DC( dc );
}
-static void predict_4x4_dc_top( uint8_t *src )
+static void predict_4x4_dc_top( pixel *src )
{
- uint32_t dc = ((SRC(0,-1) + SRC(1,-1) + SRC(2,-1) + SRC(3,-1) + 2) >> 2) * 0x01010101;
- PREDICT_4x4_DC(dc);
+ pixel4 dc = PIXEL_SPLAT_X4( (SRC(0,-1) + SRC(1,-1) + SRC(2,-1) + SRC(3,-1) + 2) >> 2 );
+ PREDICT_4x4_DC( dc );
}
-static void predict_4x4_dc( uint8_t *src )
+static void predict_4x4_dc( pixel *src )
{
- uint32_t dc = ((SRC(-1,0) + SRC(-1,1) + SRC(-1,2) + SRC(-1,3) +
- SRC(0,-1) + SRC(1,-1) + SRC(2,-1) + SRC(3,-1) + 4) >> 3) * 0x01010101;
- PREDICT_4x4_DC(dc);
+ pixel4 dc = PIXEL_SPLAT_X4( (SRC(-1,0) + SRC(-1,1) + SRC(-1,2) + SRC(-1,3) +
+ SRC(0,-1) + SRC(1,-1) + SRC(2,-1) + SRC(3,-1) + 4) >> 3 );
+ PREDICT_4x4_DC( dc );
}
-static void predict_4x4_h( uint8_t *src )
+static void predict_4x4_h( pixel *src )
{
- SRC32(0,0) = SRC(-1,0) * 0x01010101;
- SRC32(0,1) = SRC(-1,1) * 0x01010101;
- SRC32(0,2) = SRC(-1,2) * 0x01010101;
- SRC32(0,3) = SRC(-1,3) * 0x01010101;
+ SRC_X4(0,0) = PIXEL_SPLAT_X4( SRC(-1,0) );
+ SRC_X4(0,1) = PIXEL_SPLAT_X4( SRC(-1,1) );
+ SRC_X4(0,2) = PIXEL_SPLAT_X4( SRC(-1,2) );
+ SRC_X4(0,3) = PIXEL_SPLAT_X4( SRC(-1,3) );
}
-static void predict_4x4_v( uint8_t *src )
+static void predict_4x4_v( pixel *src )
{
- PREDICT_4x4_DC(SRC32(0,-1));
+ PREDICT_4x4_DC(SRC_X4(0,-1));
}
#define PREDICT_4x4_LOAD_LEFT\
#define F1(a,b) (((a)+(b)+1)>>1)
#define F2(a,b,c) (((a)+2*(b)+(c)+2)>>2)
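F1 and F2 are the standard 2-tap average and [1,2,1] 3-tap filters with round-to-nearest; they compute in int, so they are already independent of the pixel width. For example, F2(10,20,30) = (10 + 2*20 + 30 + 2) >> 2 = 82 >> 2 = 20, and F1(10,20) = (10 + 20 + 1) >> 1 = 15.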
-static void predict_4x4_ddl( uint8_t *src )
+static void predict_4x4_ddl( pixel *src )
{
PREDICT_4x4_LOAD_TOP
PREDICT_4x4_LOAD_TOP_RIGHT
SRC(3,2)=SRC(2,3)= F2(t5,t6,t7);
SRC(3,3)= F2(t6,t7,t7);
}
-static void predict_4x4_ddr( uint8_t *src )
+static void predict_4x4_ddr( pixel *src )
{
int lt = SRC(-1,-1);
PREDICT_4x4_LOAD_LEFT
SRC(0,3)= F2(l1,l2,l3);
}
-static void predict_4x4_vr( uint8_t *src )
+static void predict_4x4_vr( pixel *src )
{
int lt = SRC(-1,-1);
PREDICT_4x4_LOAD_LEFT
SRC(3,0)= F1(t2,t3);
}
-static void predict_4x4_hd( uint8_t *src )
+static void predict_4x4_hd( pixel *src )
{
int lt= SRC(-1,-1);
PREDICT_4x4_LOAD_LEFT
SRC(3,0)= F2(t2,t1,t0);
}
-static void predict_4x4_vl( uint8_t *src )
+static void predict_4x4_vl( pixel *src )
{
PREDICT_4x4_LOAD_TOP
PREDICT_4x4_LOAD_TOP_RIGHT
SRC(3,3)= F2(t4,t5,t6);
}
-static void predict_4x4_hu( uint8_t *src )
+static void predict_4x4_hu( pixel *src )
{
PREDICT_4x4_LOAD_LEFT
SRC(0,0)= F1(l0,l1);
#define PT(x) \
edge[16+x] = F2(SRC(x-1,-1), SRC(x,-1), SRC(x+1,-1));
-static void predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters )
+static void predict_8x8_filter( pixel *src, pixel edge[33], int i_neighbor, int i_filters )
{
/* edge[7..14] = l7..l0
* edge[15] = lt
#define PREDICT_8x8_DC(v) \
for( int y = 0; y < 8; y++ ) { \
- M32( src+0 ) = v; \
- M32( src+4 ) = v; \
+ MPIXEL_X4( src+0 ) = v; \
+ MPIXEL_X4( src+4 ) = v; \
src += FDEC_STRIDE; \
}
-static void predict_8x8_dc_128( uint8_t *src, uint8_t edge[33] )
+static void predict_8x8_dc_128( pixel *src, pixel edge[33] )
{
- PREDICT_8x8_DC(0x80808080);
+ PREDICT_8x8_DC( PIXEL_SPLAT_X4( 0x80 ) );
}
-static void predict_8x8_dc_left( uint8_t *src, uint8_t edge[33] )
+static void predict_8x8_dc_left( pixel *src, pixel edge[33] )
{
PREDICT_8x8_LOAD_LEFT
- uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
- PREDICT_8x8_DC(dc);
+ pixel4 dc = PIXEL_SPLAT_X4( (l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3 );
+ PREDICT_8x8_DC( dc );
}
-static void predict_8x8_dc_top( uint8_t *src, uint8_t edge[33] )
+static void predict_8x8_dc_top( pixel *src, pixel edge[33] )
{
PREDICT_8x8_LOAD_TOP
- uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
- PREDICT_8x8_DC(dc);
+ pixel4 dc = PIXEL_SPLAT_X4( (t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3 );
+ PREDICT_8x8_DC( dc );
}
-static void predict_8x8_dc( uint8_t *src, uint8_t edge[33] )
+static void predict_8x8_dc( pixel *src, pixel edge[33] )
{
PREDICT_8x8_LOAD_LEFT
PREDICT_8x8_LOAD_TOP
- uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
- +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
- PREDICT_8x8_DC(dc);
+ pixel4 dc = PIXEL_SPLAT_X4( (l0+l1+l2+l3+l4+l5+l6+l7+t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4 );
+ PREDICT_8x8_DC( dc );
}
-static void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
+static void predict_8x8_h( pixel *src, pixel edge[33] )
{
PREDICT_8x8_LOAD_LEFT
-#define ROW(y) M32( src+y*FDEC_STRIDE+0 ) =\
- M32( src+y*FDEC_STRIDE+4 ) = 0x01010101U * l##y;
+#define ROW(y) MPIXEL_X4( src+y*FDEC_STRIDE+0 ) =\
+ MPIXEL_X4( src+y*FDEC_STRIDE+4 ) = PIXEL_SPLAT_X4( l##y );
ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
#undef ROW
}
-static void predict_8x8_v( uint8_t *src, uint8_t edge[33] )
+static void predict_8x8_v( pixel *src, pixel edge[33] )
{
uint64_t top = M64( edge+16 );
for( int y = 0; y < 8; y++ )
M64( src+y*FDEC_STRIDE ) = top;
}
-static void predict_8x8_ddl( uint8_t *src, uint8_t edge[33] )
+static void predict_8x8_ddl( pixel *src, pixel edge[33] )
{
PREDICT_8x8_LOAD_TOP
PREDICT_8x8_LOAD_TOPRIGHT
SRC(6,7)=SRC(7,6)= F2(t13,t14,t15);
SRC(7,7)= F2(t14,t15,t15);
}
-static void predict_8x8_ddr( uint8_t *src, uint8_t edge[33] )
+static void predict_8x8_ddr( pixel *src, pixel edge[33] )
{
PREDICT_8x8_LOAD_TOP
PREDICT_8x8_LOAD_LEFT
SRC(7,0)= F2(t5,t6,t7);
}
-static void predict_8x8_vr( uint8_t *src, uint8_t edge[33] )
+static void predict_8x8_vr( pixel *src, pixel edge[33] )
{
PREDICT_8x8_LOAD_TOP
PREDICT_8x8_LOAD_LEFT
SRC(7,1)= F2(t5,t6,t7);
SRC(7,0)= F1(t6,t7);
}
-static void predict_8x8_hd( uint8_t *src, uint8_t edge[33] )
+static void predict_8x8_hd( pixel *src, pixel edge[33] )
{
PREDICT_8x8_LOAD_TOP
PREDICT_8x8_LOAD_LEFT
PREDICT_8x8_LOAD_TOPLEFT
- int p1 = pack8to16(F1(l6,l7), F2(l5,l6,l7));
- int p2 = pack8to16(F1(l5,l6), F2(l4,l5,l6));
- int p3 = pack8to16(F1(l4,l5), F2(l3,l4,l5));
- int p4 = pack8to16(F1(l3,l4), F2(l2,l3,l4));
- int p5 = pack8to16(F1(l2,l3), F2(l1,l2,l3));
- int p6 = pack8to16(F1(l1,l2), F2(l0,l1,l2));
- int p7 = pack8to16(F1(l0,l1), F2(lt,l0,l1));
- int p8 = pack8to16(F1(lt,l0), F2(l0,lt,t0));
- int p9 = pack8to16(F2(t1,t0,lt), F2(t2,t1,t0));
- int p10 = pack8to16(F2(t3,t2,t1), F2(t4,t3,t2));
- int p11 = pack8to16(F2(t5,t4,t3), F2(t6,t5,t4));
- SRC32(0,7)= pack16to32(p1,p2);
- SRC32(0,6)= pack16to32(p2,p3);
- SRC32(4,7)=SRC32(0,5)= pack16to32(p3,p4);
- SRC32(4,6)=SRC32(0,4)= pack16to32(p4,p5);
- SRC32(4,5)=SRC32(0,3)= pack16to32(p5,p6);
- SRC32(4,4)=SRC32(0,2)= pack16to32(p6,p7);
- SRC32(4,3)=SRC32(0,1)= pack16to32(p7,p8);
- SRC32(4,2)=SRC32(0,0)= pack16to32(p8,p9);
- SRC32(4,1)= pack16to32(p9,p10);
- SRC32(4,0)= pack16to32(p10,p11);
-}
-static void predict_8x8_vl( uint8_t *src, uint8_t edge[33] )
+ int p1 = pack_pixel_1to2(F1(l6,l7), F2(l5,l6,l7));
+ int p2 = pack_pixel_1to2(F1(l5,l6), F2(l4,l5,l6));
+ int p3 = pack_pixel_1to2(F1(l4,l5), F2(l3,l4,l5));
+ int p4 = pack_pixel_1to2(F1(l3,l4), F2(l2,l3,l4));
+ int p5 = pack_pixel_1to2(F1(l2,l3), F2(l1,l2,l3));
+ int p6 = pack_pixel_1to2(F1(l1,l2), F2(l0,l1,l2));
+ int p7 = pack_pixel_1to2(F1(l0,l1), F2(lt,l0,l1));
+ int p8 = pack_pixel_1to2(F1(lt,l0), F2(l0,lt,t0));
+ int p9 = pack_pixel_1to2(F2(t1,t0,lt), F2(t2,t1,t0));
+ int p10 = pack_pixel_1to2(F2(t3,t2,t1), F2(t4,t3,t2));
+ int p11 = pack_pixel_1to2(F2(t5,t4,t3), F2(t6,t5,t4));
+ SRC_X4(0,7)= pack_pixel_2to4(p1,p2);
+ SRC_X4(0,6)= pack_pixel_2to4(p2,p3);
+ SRC_X4(4,7)=SRC_X4(0,5)= pack_pixel_2to4(p3,p4);
+ SRC_X4(4,6)=SRC_X4(0,4)= pack_pixel_2to4(p4,p5);
+ SRC_X4(4,5)=SRC_X4(0,3)= pack_pixel_2to4(p5,p6);
+ SRC_X4(4,4)=SRC_X4(0,2)= pack_pixel_2to4(p6,p7);
+ SRC_X4(4,3)=SRC_X4(0,1)= pack_pixel_2to4(p7,p8);
+ SRC_X4(4,2)=SRC_X4(0,0)= pack_pixel_2to4(p8,p9);
+ SRC_X4(4,1)= pack_pixel_2to4(p9,p10);
+ SRC_X4(4,0)= pack_pixel_2to4(p10,p11);
+}
+static void predict_8x8_vl( pixel *src, pixel edge[33] )
{
PREDICT_8x8_LOAD_TOP
PREDICT_8x8_LOAD_TOPRIGHT
SRC(7,6)= F1(t10,t11);
SRC(7,7)= F2(t10,t11,t12);
}
-static void predict_8x8_hu( uint8_t *src, uint8_t edge[33] )
+static void predict_8x8_hu( pixel *src, pixel edge[33] )
{
PREDICT_8x8_LOAD_LEFT
- int p1 = pack8to16(F1(l0,l1), F2(l0,l1,l2));
- int p2 = pack8to16(F1(l1,l2), F2(l1,l2,l3));
- int p3 = pack8to16(F1(l2,l3), F2(l2,l3,l4));
- int p4 = pack8to16(F1(l3,l4), F2(l3,l4,l5));
- int p5 = pack8to16(F1(l4,l5), F2(l4,l5,l6));
- int p6 = pack8to16(F1(l5,l6), F2(l5,l6,l7));
- int p7 = pack8to16(F1(l6,l7), F2(l6,l7,l7));
- int p8 = pack8to16(l7,l7);
- SRC32(0,0)= pack16to32(p1,p2);
- SRC32(0,1)= pack16to32(p2,p3);
- SRC32(4,0)=SRC32(0,2)= pack16to32(p3,p4);
- SRC32(4,1)=SRC32(0,3)= pack16to32(p4,p5);
- SRC32(4,2)=SRC32(0,4)= pack16to32(p5,p6);
- SRC32(4,3)=SRC32(0,5)= pack16to32(p6,p7);
- SRC32(4,4)=SRC32(0,6)= pack16to32(p7,p8);
- SRC32(4,5)=SRC32(4,6)= SRC32(0,7) = SRC32(4,7) = pack16to32(p8,p8);
+ int p1 = pack_pixel_1to2(F1(l0,l1), F2(l0,l1,l2));
+ int p2 = pack_pixel_1to2(F1(l1,l2), F2(l1,l2,l3));
+ int p3 = pack_pixel_1to2(F1(l2,l3), F2(l2,l3,l4));
+ int p4 = pack_pixel_1to2(F1(l3,l4), F2(l3,l4,l5));
+ int p5 = pack_pixel_1to2(F1(l4,l5), F2(l4,l5,l6));
+ int p6 = pack_pixel_1to2(F1(l5,l6), F2(l5,l6,l7));
+ int p7 = pack_pixel_1to2(F1(l6,l7), F2(l6,l7,l7));
+ int p8 = pack_pixel_1to2(l7,l7);
+ SRC_X4(0,0)= pack_pixel_2to4(p1,p2);
+ SRC_X4(0,1)= pack_pixel_2to4(p2,p3);
+ SRC_X4(4,0)=SRC_X4(0,2)= pack_pixel_2to4(p3,p4);
+ SRC_X4(4,1)=SRC_X4(0,3)= pack_pixel_2to4(p4,p5);
+ SRC_X4(4,2)=SRC_X4(0,4)= pack_pixel_2to4(p5,p6);
+ SRC_X4(4,3)=SRC_X4(0,5)= pack_pixel_2to4(p6,p7);
+ SRC_X4(4,4)=SRC_X4(0,6)= pack_pixel_2to4(p7,p8);
+ SRC_X4(4,5)=SRC_X4(4,6)= SRC_X4(0,7) = SRC_X4(4,7) = pack_pixel_2to4(p8,p8);
}
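pack_pixel_1to2 and pack_pixel_2to4 replace the old pack8to16/pack16to32 in predict_8x8_hd and predict_8x8_hu. Their definitions are outside these hunks; for the current 8-bit pixel they would plausibly look like this (an illustrative assumption):

    #define pack_pixel_1to2(a,b) ((uint16_t)(a) | ((uint16_t)(b)<<8))  /* a = low pixel */
    #define pack_pixel_2to4(a,b) ((pixel4)(a)   | ((pixel4)(b)<<16))

with the shift amounts scaling by sizeof(pixel) once pixels widen.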
/****************************************************************************
#ifndef X264_PREDICT_H
#define X264_PREDICT_H
-typedef void (*x264_predict_t)( uint8_t *src );
-typedef void (*x264_predict8x8_t)( uint8_t *src, uint8_t edge[33] );
-typedef void (*x264_predict_8x8_filter_t) ( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
+typedef void (*x264_predict_t)( pixel *src );
+typedef void (*x264_predict8x8_t)( pixel *src, pixel edge[33] );
+typedef void (*x264_predict_8x8_filter_t) ( pixel *src, pixel edge[33], int i_neighbor, int i_filters );
enum intra_chroma_pred_e
{
static const int zoom = 2; /* Zoom factor */
static const int pad = 32;
- uint8_t *const frame = h->fdec->plane[0];
+ pixel *const frame = h->fdec->plane[0];
const int width = h->param.i_width;
const int height = h->param.i_height;
const int stride = h->fdec->i_stride[0];
int width = frame->i_width[0] + 2*PADH;
int i_padv = PADV << h->param.b_interlaced;
int offset, height;
- uint8_t *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
+ pixel *src = frame->filtered[0] - frame->i_stride[0]*i_padv - PADH;
height = X264_MIN( 16 + end + i_padv, h->fref0[j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
offset = h->fenc->i_lines_weighted*frame->i_stride[0];
h->fenc->i_lines_weighted += height;
for( int k = j; k < h->i_ref0; k++ )
if( h->sh.weight[k][0].weightfn )
{
- uint8_t *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
+ pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
src + offset, frame->i_stride[0],
width, height, &h->sh.weight[k][0] );
/* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
{
- ALIGNED_16( static uint8_t zero[16*FDEC_STRIDE] ) = {0};
+ ALIGNED_16( static pixel zero[16*FDEC_STRIDE] ) = {0};
if( do_both_dct || h->mb.b_transform_8x8 )
h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_inter )
{
const unsigned int flags = h->sh.i_type == SLICE_TYPE_I ? h->param.analyse.intra : h->param.analyse.inter;
- uint8_t *p_src = h->mb.pic.p_fenc[0];
- uint8_t *p_dst = h->mb.pic.p_fdec[0];
+ pixel *p_src = h->mb.pic.p_fenc[0];
+ pixel *p_dst = h->mb.pic.p_fdec[0];
int idx;
int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16 && !h->mb.b_lossless;
/* 8x8 prediction selection */
if( flags & X264_ANALYSE_I8x8 )
{
- ALIGNED_ARRAY_16( uint8_t, edge,[33] );
+ ALIGNED_ARRAY_16( pixel, edge,[33] );
x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
{
int x = idx&1;
int y = idx>>1;
- uint8_t *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
- uint8_t *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
+ pixel *p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
+ pixel *p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
int i_best = COST_MAX;
int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
for( idx = 0;; idx++ )
{
- uint8_t *p_src_by = p_src + block_idx_xy_fenc[idx];
- uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
+ pixel *p_src_by = p_src + block_idx_xy_fenc[idx];
+ pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
int i_best = COST_MAX;
int i_pred_mode = x264_mb_predict_intra4x4_mode( h, idx );
if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
/* emulate missing topright samples */
- M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+ MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
if( b_merged_satd && predict_mode[5] >= 0 )
{
static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
{
- uint8_t *p_dst = h->mb.pic.p_fdec[0];
+ pixel *p_dst = h->mb.pic.p_fdec[0];
uint64_t i_satd, i_best;
h->mb.i_skip_intra = 0;
if( h->mb.i_type == I_4x4 )
{
- uint32_t pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
+ pixel4 pels[4] = {0}; // doesn't need initting, just shuts up a gcc warning
int i_nnz = 0;
for( int idx = 0; idx < 16; idx++ )
{
- uint8_t *p_dst_by = p_dst + block_idx_xy_fdec[idx];
+ pixel *p_dst_by = p_dst + block_idx_xy_fdec[idx];
i_best = COST_MAX64;
predict_mode = predict_4x4_mode_available( h->mb.i_neighbour4[idx] );
if( (h->mb.i_neighbour4[idx] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
/* emulate missing topright samples */
- M32( &p_dst_by[4 - FDEC_STRIDE] ) = p_dst_by[3 - FDEC_STRIDE] * 0x01010101U;
+ MPIXEL_X4( &p_dst_by[4 - FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst_by[3 - FDEC_STRIDE] );
for( ; *predict_mode >= 0; predict_mode++ )
{
{
a->i_predict4x4[idx] = i_mode;
i_best = i_satd;
- pels[0] = M32( p_dst_by+0*FDEC_STRIDE );
- pels[1] = M32( p_dst_by+1*FDEC_STRIDE );
- pels[2] = M32( p_dst_by+2*FDEC_STRIDE );
- pels[3] = M32( p_dst_by+3*FDEC_STRIDE );
+ pels[0] = MPIXEL_X4( p_dst_by+0*FDEC_STRIDE );
+ pels[1] = MPIXEL_X4( p_dst_by+1*FDEC_STRIDE );
+ pels[2] = MPIXEL_X4( p_dst_by+2*FDEC_STRIDE );
+ pels[3] = MPIXEL_X4( p_dst_by+3*FDEC_STRIDE );
i_nnz = h->mb.cache.non_zero_count[x264_scan8[idx]];
}
}
- M32( p_dst_by+0*FDEC_STRIDE ) = pels[0];
- M32( p_dst_by+1*FDEC_STRIDE ) = pels[1];
- M32( p_dst_by+2*FDEC_STRIDE ) = pels[2];
- M32( p_dst_by+3*FDEC_STRIDE ) = pels[3];
+ MPIXEL_X4( p_dst_by+0*FDEC_STRIDE ) = pels[0];
+ MPIXEL_X4( p_dst_by+1*FDEC_STRIDE ) = pels[1];
+ MPIXEL_X4( p_dst_by+2*FDEC_STRIDE ) = pels[2];
+ MPIXEL_X4( p_dst_by+3*FDEC_STRIDE ) = pels[3];
h->mb.cache.non_zero_count[x264_scan8[idx]] = i_nnz;
h->mb.cache.intra4x4_pred_mode[x264_scan8[idx]] = a->i_predict4x4[idx];
}
else if( h->mb.i_type == I_8x8 )
{
- ALIGNED_ARRAY_16( uint8_t, edge,[33] );
+ ALIGNED_ARRAY_16( pixel, edge,[33] );
for( int idx = 0; idx < 4; idx++ )
{
- uint64_t pels_h = 0;
- uint8_t pels_v[7];
+ pixel4 pels_h[2] = {0};
+ pixel pels_v[7];
uint16_t i_nnz[2] = {0}; //shut up gcc
- uint8_t *p_dst_by;
+ pixel *p_dst_by;
int cbp_luma_new = 0;
int i_thresh = a->i_satd_i8x8_dir[a->i_predict8x8[idx]][idx] * 11/8;
cbp_luma_new = h->mb.i_cbp_luma;
i_best = i_satd;
- pels_h = M64( p_dst_by+7*FDEC_STRIDE );
+ pels_h[0] = MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+0 );
+ pels_h[1] = MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+4 );
if( !(idx&1) )
for( int j = 0; j < 7; j++ )
pels_v[j] = p_dst_by[7+j*FDEC_STRIDE];
}
}
a->i_cbp_i8x8_luma = cbp_luma_new;
- M64( p_dst_by+7*FDEC_STRIDE ) = pels_h;
+ MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+0 ) = pels_h[0];
+ MPIXEL_X4( p_dst_by+7*FDEC_STRIDE+4 ) = pels_h[1];
if( !(idx&1) )
for( int j = 0; j < 7; j++ )
p_dst_by[7+j*FDEC_STRIDE] = pels_v[j];
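The i8x8 refinement used to snapshot the bottom row of a block with a single M64 of eight 8-bit pixels; once pixels widen, eight of them no longer fit in 64 bits, so the backup is split into two pixel4 halves. The same pattern in isolation (helper names hypothetical):

    static void backup_row8( pixel4 bak[2], pixel *row )
    {
        bak[0] = MPIXEL_X4( row+0 ); /* pixels 0..3 */
        bak[1] = MPIXEL_X4( row+4 ); /* pixels 4..7 */
    }
    static void restore_row8( pixel *row, const pixel4 bak[2] )
    {
        MPIXEL_X4( row+0 ) = bak[0];
        MPIXEL_X4( row+4 ) = bak[1];
    }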
static void x264_mb_analyse_inter_p8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
x264_me_t m;
- uint8_t **p_fenc = h->mb.pic.p_fenc;
+ pixel **p_fenc = h->mb.pic.p_fenc;
int i_maxref = h->mb.pic.i_fref[0]-1;
h->mb.i_partition = D_8x8;
* don't bother analysing the dupes. */
const int i_ref = h->mb.ref_blind_dupe == a->l0.me16x16.i_ref ? 0 : a->l0.me16x16.i_ref;
const int i_ref_cost = h->param.b_cabac || i_ref ? REF_COST( 0, i_ref ) : 0;
- uint8_t **p_fenc = h->mb.pic.p_fenc;
+ pixel **p_fenc = h->mb.pic.p_fenc;
int i_mvc;
int16_t (*mvc)[2] = a->l0.mvc[i_ref];
static void x264_mb_analyse_inter_p16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
x264_me_t m;
- uint8_t **p_fenc = h->mb.pic.p_fenc;
+ pixel **p_fenc = h->mb.pic.p_fenc;
ALIGNED_4( int16_t mvc[3][2] );
/* XXX Needed for x264_mb_predict_mv */
static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
x264_me_t m;
- uint8_t **p_fenc = h->mb.pic.p_fenc;
+ pixel **p_fenc = h->mb.pic.p_fenc;
ALIGNED_4( int16_t mvc[3][2] );
/* XXX Needed for x264_mb_predict_mv */
a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
}
-static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, uint8_t **p_fref, int i8x8, int pixel )
+static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
{
- ALIGNED_ARRAY_8( uint8_t, pix1,[16*8] );
- uint8_t *pix2 = pix1+8;
+ ALIGNED_ARRAY_8( pixel, pix1,[16*8] );
+ pixel *pix2 = pix1+8;
const int i_stride = h->mb.pic.i_stride[1];
const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
- if( pixel == PIXEL_4x4 )
+ if( size == PIXEL_4x4 )
{
x264_me_t *m = a->l0.me4x4[i8x8];
CHROMA4x4MC( 2,2, m[0], 0,0 );
CHROMA4x4MC( 2,2, m[2], 0,2 );
CHROMA4x4MC( 2,2, m[3], 2,2 );
}
- else if( pixel == PIXEL_8x4 )
+ else if( size == PIXEL_8x4 )
{
x264_me_t *m = a->l0.me8x4[i8x8];
CHROMA4x4MC( 4,2, m[0], 0,0 );
static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
- uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
- uint8_t **p_fenc = h->mb.pic.p_fenc;
+ pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
+ pixel **p_fenc = h->mb.pic.p_fenc;
const int i_ref = a->l0.me8x8[i8x8].i_ref;
/* XXX Needed for x264_mb_predict_mv */
static void x264_mb_analyse_inter_p8x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
- uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
- uint8_t **p_fenc = h->mb.pic.p_fenc;
+ pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
+ pixel **p_fenc = h->mb.pic.p_fenc;
const int i_ref = a->l0.me8x8[i8x8].i_ref;
/* XXX Needed for x264_mb_predict_mv */
static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
- uint8_t **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
- uint8_t **p_fenc = h->mb.pic.p_fenc;
+ pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
+ pixel **p_fenc = h->mb.pic.p_fenc;
const int i_ref = a->l0.me8x8[i8x8].i_ref;
/* XXX Needed for x264_mb_predict_mv */
/* Assumes that fdec still contains the results of
* x264_mb_predict_mv_direct16x16 and x264_mb_mc */
- uint8_t *p_fenc = h->mb.pic.p_fenc[0];
- uint8_t *p_fdec = h->mb.pic.p_fdec[0];
+ pixel *p_fenc = h->mb.pic.p_fenc[0];
+ pixel *p_fdec = h->mb.pic.p_fdec[0];
a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
- ALIGNED_ARRAY_16( uint8_t, pix0,[16*16] );
- ALIGNED_ARRAY_16( uint8_t, pix1,[16*16] );
- uint8_t *src0, *src1;
+ ALIGNED_ARRAY_16( pixel, pix0,[16*16] );
+ ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
+ pixel *src0, *src1;
int stride0 = 16, stride1 = 16;
int i_ref, i_mvc;
ALIGNED_4( int16_t mvc[9][2] );
static void x264_mb_analyse_inter_b8x8_mixed_ref( x264_t *h, x264_mb_analysis_t *a )
{
- ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
+ ALIGNED_ARRAY_8( pixel, pix,[2],[8*8] );
int i_maxref[2] = {h->mb.pic.i_fref[0]-1, h->mb.pic.i_fref[1]-1};
/* early termination: if 16x16 chose ref 0, then evaluate no refs older

int i_part_cost;
int i_part_cost_bi;
int stride[2] = {8,8};
- uint8_t *src[2];
+ pixel *src[2];
x264_me_t m;
m.i_pixel = PIXEL_8x8;
LOAD_FENC( &m, h->mb.pic.p_fenc, 8*x8, 8*y8 );
static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
{
- uint8_t **p_fref[2] =
+ pixel **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.me16x16.i_ref],
h->mb.pic.p_fref[1][a->l1.me16x16.i_ref] };
- ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*8] );
+ ALIGNED_ARRAY_8( pixel, pix,[2],[8*8] );
/* XXX Needed for x264_mb_predict_mv */
h->mb.i_partition = D_8x8;
int i_part_cost;
int i_part_cost_bi = 0;
int stride[2] = {8,8};
- uint8_t *src[2];
+ pixel *src[2];
for( int l = 0; l < 2; l++ )
{
static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
- ALIGNED_ARRAY_16( uint8_t, pix,[2],[16*8] );
+ ALIGNED_ARRAY_16( pixel, pix,[2],[16*8] );
ALIGNED_4( int16_t mvc[3][2] );
h->mb.i_partition = D_16x8;
int i_part_cost;
int i_part_cost_bi = 0;
int stride[2] = {16,16};
- uint8_t *src[2];
+ pixel *src[2];
x264_me_t m;
m.i_pixel = PIXEL_16x8;
LOAD_FENC( &m, h->mb.pic.p_fenc, 0, 8*i );
static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
- ALIGNED_ARRAY_8( uint8_t, pix,[2],[8*16] );
+ ALIGNED_ARRAY_8( pixel, pix,[2],[8*16] );
ALIGNED_4( int16_t mvc[3][2] );
h->mb.i_partition = D_8x16;
int i_part_cost;
int i_part_cost_bi = 0;
int stride[2] = {8,8};
- uint8_t *src[2];
+ pixel *src[2];
x264_me_t m;
m.i_pixel = PIXEL_8x16;
LOAD_FENC( &m, h->mb.pic.p_fenc, 8*i, 0 );
//scale full resolution frame
if( h->sh.weight[j][0].weightfn && h->param.i_threads == 1 )
{
- uint8_t *src = h->fref0[j]->filtered[0] - h->fref0[j]->i_stride[0]*i_padv - PADH;
- uint8_t *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
+ pixel *src = h->fref0[j]->filtered[0] - h->fref0[j]->i_stride[0]*i_padv - PADH;
+ pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
int stride = h->fenc->i_stride[0];
int width = h->fenc->i_width[0] + PADH*2;
int height = h->fenc->i_lines[0] + i_padv*2;
{
memcpy( h->intra_border_backup[j][i],
h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i],
- h->sps->i_mb_width*16 >> !!i );
+ (h->sps->i_mb_width*16 >> !!i) * sizeof(pixel) );
}
if( b_deblock )
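memcpy and friends take byte counts, so every length expressed in pixels is now scaled by sizeof(pixel), while lengths that genuinely count bytes (the nnz flags in the zigzag tests further down, for instance) deliberately stay unscaled. The rule as a trivial sketch:

    #include <string.h>
    static void copy_pixel_row( pixel *dst, const pixel *src, int width )
    {
        /* width counts pixels, memcpy counts bytes */
        memcpy( dst, src, width * sizeof(pixel) );
    }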
void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
{
int nz;
- uint8_t *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
- uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
+ pixel *p_src = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[idx]];
+ pixel *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[idx]];
ALIGNED_ARRAY_16( int16_t, dct4x4,[16] );
if( h->mb.b_lossless )
int y = idx>>1;
int s8 = X264_SCAN8_0 + 2*x + 16*y;
int nz;
- uint8_t *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
- uint8_t *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
+ pixel *p_src = &h->mb.pic.p_fenc[0][8*x + 8*y*FENC_STRIDE];
+ pixel *p_dst = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
ALIGNED_ARRAY_16( int16_t, dct8x8,[64] );
if( h->mb.b_lossless )
static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
{
- uint8_t *p_src = h->mb.pic.p_fenc[0];
- uint8_t *p_dst = h->mb.pic.p_fdec[0];
+ pixel *p_src = h->mb.pic.p_fenc[0];
+ pixel *p_dst = h->mb.pic.p_fdec[0];
ALIGNED_ARRAY_16( int16_t, dct4x4,[16],[16] );
ALIGNED_ARRAY_16( int16_t, dct_dc4x4,[16] );
for( int ch = 0; ch < 2; ch++ )
{
- uint8_t *p_src = h->mb.pic.p_fenc[1+ch];
- uint8_t *p_dst = h->mb.pic.p_fdec[1+ch];
+ pixel *p_src = h->mb.pic.p_fenc[1+ch];
+ pixel *p_dst = h->mb.pic.p_fdec[1+ch];
int i_decimate_score = 0;
int nz_ac = 0;
}
}
-void x264_predict_lossless_4x4( x264_t *h, uint8_t *p_dst, int idx, int i_mode )
+void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int idx, int i_mode )
{
int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
- uint8_t *p_src = h->mb.pic.p_fenc_plane[0] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
+ pixel *p_src = h->mb.pic.p_fenc_plane[0] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
if( i_mode == I_PRED_4x4_V )
h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
h->predict_4x4[i_mode]( p_dst );
}
-void x264_predict_lossless_8x8( x264_t *h, uint8_t *p_dst, int idx, int i_mode, uint8_t edge[33] )
+void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int idx, int i_mode, pixel edge[33] )
{
int stride = h->fenc->i_stride[0] << h->mb.b_interlaced;
- uint8_t *p_src = h->mb.pic.p_fenc_plane[0] + (idx&1)*8 + (idx>>1)*8*stride;
+ pixel *p_src = h->mb.pic.p_fenc_plane[0] + (idx&1)*8 + (idx>>1)*8*stride;
if( i_mode == I_PRED_8x8_V )
h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
}
else if( h->mb.i_type == I_8x8 )
{
- ALIGNED_ARRAY_16( uint8_t, edge,[33] );
+ ALIGNED_ARRAY_16( pixel, edge,[33] );
h->mb.b_transform_8x8 = 1;
/* If we already encoded 3 of the 4 i8x8 blocks, we don't have to do them again. */
if( h->mb.i_skip_intra )
}
for( int i = h->mb.i_skip_intra ? 3 : 0 ; i < 4; i++ )
{
- uint8_t *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
- int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
+ pixel *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
+ int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
h->predict_8x8_filter( p_dst, edge, h->mb.i_neighbour8[i], x264_pred_i4x4_neighbors[i_mode] );
if( h->mb.b_lossless )
}
for( int i = h->mb.i_skip_intra ? 15 : 0 ; i < 16; i++ )
{
- uint8_t *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i]];
- int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
+ pixel *p_dst = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i]];
+ int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[i]];
if( (h->mb.i_neighbour4[i] & (MB_TOPRIGHT|MB_TOP)) == MB_TOP )
/* emulate missing topright samples */
- M32( &p_dst[4-FDEC_STRIDE] ) = p_dst[3-FDEC_STRIDE] * 0x01010101U;
+ MPIXEL_X4( &p_dst[4-FDEC_STRIDE] ) = PIXEL_SPLAT_X4( p_dst[3-FDEC_STRIDE] );
if( h->mb.b_lossless )
x264_predict_lossless_4x4( h, p_dst, i, i_mode );
for( int ch = 0; ch < 2; ch++ )
{
- uint8_t *p_src = h->mb.pic.p_fenc[1+ch];
- uint8_t *p_dst = h->mb.pic.p_fdec[1+ch];
+ pixel *p_src = h->mb.pic.p_fenc[1+ch];
+ pixel *p_dst = h->mb.pic.p_fdec[1+ch];
if( !b_bidir )
{
int x = i8&1;
int y = i8>>1;
int s8 = X264_SCAN8_0 + 2*x + 16*y;
- uint8_t *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
- uint8_t *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
+ pixel *p_fenc = h->mb.pic.p_fenc[0] + 8*x + 8*y*FENC_STRIDE;
+ pixel *p_fdec = h->mb.pic.p_fdec[0] + 8*x + 8*y*FDEC_STRIDE;
int b_decimate = h->mb.b_dct_decimate;
int nnz8x8 = 0;
int nz;
void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
{
int i_qp = h->mb.i_qp;
- uint8_t *p_fenc = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[i4]];
- uint8_t *p_fdec = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
+ pixel *p_fenc = &h->mb.pic.p_fenc[0][block_idx_xy_fenc[i4]];
+ pixel *p_fdec = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
int nz;
/* Don't need motion compensation as this function is only used in qpel-RD, which caches pixel data. */
x264_macroblock_probe_skip( h, 1 )
void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode );
-void x264_predict_lossless_4x4( x264_t *h, uint8_t *p_dst, int idx, int i_mode );
-void x264_predict_lossless_8x8( x264_t *h, uint8_t *p_dst, int idx, int i_mode, uint8_t edge[33] );
+void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int idx, int i_mode );
+void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int idx, int i_mode, pixel edge[33] );
void x264_predict_lossless_16x16( x264_t *h, int i_mode );
void x264_macroblock_encode ( x264_t *h );
#define COST_MV_HPEL( mx, my ) \
{ \
int stride2 = 16; \
- uint8_t *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
+ pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
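COST_MV_HPEL reads through the pointer get_ref returns rather than through pix itself: when the motion vector is half-pel aligned, get_ref can hand back a pointer straight into a precomputed plane (updating the stride accordingly) and only interpolates into the caller's buffer when it has to. A sketch of that calling convention:

    int stride2 = 16;  /* in: stride of the scratch buffer pix */
    pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride,
                                mx, my, bw, bh, &m->weight[0] );
    /* out: src may alias a reference plane and stride2 may have changed,
     * so always read src with stride2 and never assume src == pix */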
#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
{\
- uint8_t *pix_base = p_fref_w + bmx + bmy*stride;\
+ pixel *pix_base = p_fref_w + bmx + bmy*stride;\
h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
pix_base + (m0x) + (m0y)*stride,\
pix_base + (m1x) + (m1y)*stride,\
#define COST_MV_X4_DIR( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs )\
{\
- uint8_t *pix_base = p_fref_w + bmx + bmy*stride;\
+ pixel *pix_base = p_fref_w + bmx + bmy*stride;\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
pix_base + (m0x) + (m0y)*stride,\
pix_base + (m1x) + (m1y)*stride,\
#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
{\
- uint8_t *pix_base = p_fref_w + omx + omy*stride;\
+ pixel *pix_base = p_fref_w + omx + omy*stride;\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
pix_base + (m0x) + (m0y)*stride,\
pix_base + (m1x) + (m1y)*stride,\
int bmx, bmy, bcost;
int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
int omx, omy, pmx, pmy;
- uint8_t *p_fenc = m->p_fenc[0];
- uint8_t *p_fref_w = m->p_fref_w;
- ALIGNED_ARRAY_16( uint8_t, pix,[16*16] );
+ pixel *p_fenc = m->p_fenc[0];
+ pixel *p_fref_w = m->p_fref_w;
+ ALIGNED_ARRAY_16( pixel, pix,[16*16] );
int costs[16];
else
{
int dir = 0;
- uint8_t *pix_base = p_fref_w + omx + (omy-4*i)*stride;
+ pixel *pix_base = p_fref_w + omx + (omy-4*i)*stride;
int dy = i*stride;
#define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3)\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
uint16_t *sums_base = m->integral;
/* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned.
* this is not a problem because it is not used for any SSE instructions. */
- ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
+ ALIGNED_16( static pixel zero[8*FENC_STRIDE] );
ALIGNED_ARRAY_16( int, enc_dc,[4] );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
cost_fpel_mvx+min_x, xs, width, bsad*17/16 );
for( i = 0; i < xn-2; i += 3 )
{
- uint8_t *ref = p_fref_w+min_x+my*stride;
+ pixel *ref = p_fref_w+min_x+my*stride;
int sads[3];
h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
for( int j = 0; j < 3; j++ )
#define COST_MV_SAD( mx, my ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
+ pixel *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
if( b_refine_qpel || (dir^1) != odir ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
+ pixel *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
if( b_chroma_me && cost < bcost ) \
const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- ALIGNED_ARRAY_16( uint8_t, pix,[64*18] ); // really 17x17x2, but round up for alignment
+ ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
int bmx = m->mv[0];
int bmy = m->mv[1];
int omx = bmx, omy = bmy;
int costs[4];
int stride = 64; // candidates are either all hpel or all qpel, so one stride is enough
- uint8_t *src0, *src1, *src2, *src3;
+ pixel *src0, *src1, *src2, *src3;
src0 = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] );
src2 = h->mc.get_ref( pix+32, &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] );
src1 = src0 + stride;
const int i_pixel = m0->i_pixel;
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
- ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] );
- ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] );
- ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] );
- uint8_t *src[2][9];
- uint8_t *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
- uint8_t *pixu = &h->mb.pic.p_fdec[1][4*x + 4*y*FDEC_STRIDE];
- uint8_t *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
+ ALIGNED_ARRAY_16( pixel, pixy_buf,[2],[9][16*16] );
+ ALIGNED_ARRAY_8( pixel, pixu_buf,[2],[9][8*8] );
+ ALIGNED_ARRAY_8( pixel, pixv_buf,[2],[9][8*8] );
+ pixel *src[2][9];
+ pixel *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
+ pixel *pixu = &h->mb.pic.p_fdec[1][4*x + 4*y*FDEC_STRIDE];
+ pixel *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
int ref0 = h->mb.cache.ref[0][s8];
int ref1 = h->mb.cache.ref[1][s8];
const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
int i8 = i4>>2;
uint16_t amvd;
- uint8_t *pix = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
- uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
- uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+ pixel *pix = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
+ pixel *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+ pixel *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
h->mb.b_skip_mc = 1;
int i_ref;
const x264_weight_t *weight;
- uint8_t *p_fref[6];
- uint8_t *p_fref_w;
- uint8_t *p_fenc[3];
+ pixel *p_fref[6];
+ pixel *p_fref_w;
+ pixel *p_fenc[3];
uint16_t *integral;
int i_stride[2];
#define COPY_CABAC_PART( pos, size )\
memcpy( &cb->state[pos], &h->cabac.state[pos], size )
-static ALWAYS_INLINE uint64_t cached_hadamard( x264_t *h, int pixel, int x, int y )
+static ALWAYS_INLINE uint64_t cached_hadamard( x264_t *h, int size, int x, int y )
{
static const uint8_t hadamard_shift_x[4] = {4, 4, 3, 3};
static const uint8_t hadamard_shift_y[4] = {4-0, 3-0, 4-1, 3-1};
static const uint8_t hadamard_offset[4] = {0, 1, 3, 5};
- int cache_index = (x >> hadamard_shift_x[pixel]) + (y >> hadamard_shift_y[pixel])
- + hadamard_offset[pixel];
+ int cache_index = (x >> hadamard_shift_x[size]) + (y >> hadamard_shift_y[size])
+ + hadamard_offset[size];
uint64_t res = h->mb.pic.fenc_hadamard_cache[cache_index];
if( res )
return res - 1;
else
{
- uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
- res = h->pixf.hadamard_ac[pixel]( fenc, FENC_STRIDE );
+ pixel *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
+ res = h->pixf.hadamard_ac[size]( fenc, FENC_STRIDE );
h->mb.pic.fenc_hadamard_cache[cache_index] = res + 1;
return res;
}
}
-static ALWAYS_INLINE int cached_satd( x264_t *h, int pixel, int x, int y )
+static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )
{
static const uint8_t satd_shift_x[3] = {3, 2, 2};
static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
static const uint8_t satd_offset[3] = {0, 8, 16};
- ALIGNED_16( static uint8_t zero[16] );
- int cache_index = (x >> satd_shift_x[pixel - PIXEL_8x4]) + (y >> satd_shift_y[pixel - PIXEL_8x4])
- + satd_offset[pixel - PIXEL_8x4];
+ ALIGNED_16( static pixel zero[16] );
+ int cache_index = (x >> satd_shift_x[size - PIXEL_8x4]) + (y >> satd_shift_y[size - PIXEL_8x4])
+ + satd_offset[size - PIXEL_8x4];
int res = h->mb.pic.fenc_satd_cache[cache_index];
if( res )
return res - 1;
else
{
- uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
- int dc = h->pixf.sad[pixel]( fenc, FENC_STRIDE, zero, 0 ) >> 1;
- res = h->pixf.satd[pixel]( fenc, FENC_STRIDE, zero, 0 ) - dc;
+ pixel *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
+ int dc = h->pixf.sad[size]( fenc, FENC_STRIDE, zero, 0 ) >> 1;
+ res = h->pixf.satd[size]( fenc, FENC_STRIDE, zero, 0 ) - dc;
h->mb.pic.fenc_satd_cache[cache_index] = res + 1;
return res;
}
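Both caches rely on the same sentinel idiom: a slot value of 0 means "not computed yet", so results are stored biased by +1 and un-biased on read. (Renaming the pixel parameter to size also frees the identifier for the new typedef.) The idiom in isolation (names hypothetical):

    #include <stdint.h>
    static uint64_t cache[8];                  /* zero-initialised: all slots empty */
    static uint64_t cached_cost( int idx, uint64_t (*compute)(void) )
    {
        if( cache[idx] )
            return cache[idx] - 1;             /* hit: undo the +1 bias */
        uint64_t res = compute();
        cache[idx] = res + 1;                  /* keep 0 reserved for "empty" */
        return res;
    }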
static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
{
- ALIGNED_16(static uint8_t zero[16]);
+ ALIGNED_16(static pixel zero[16]);
int satd = 0;
- uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
- uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
+ pixel *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
+ pixel *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
if( p == 0 && h->mb.i_psy_rd )
{
/* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
w->i_scale = X264_MIN( w->i_scale, 127 );
}
-static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest )
+static NOINLINE pixel *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dest )
{
int ref0_distance = fenc->i_frame - ref->i_frame - 1;
/* Note: this will never run during lookahead as weights_analyse is only called if no
int i_lines = fenc->i_lines_lowres;
int i_width = fenc->i_width_lowres;
int i_mb_xy = 0;
- uint8_t *p = dest;
+ pixel *p = dest;
for( int y = 0; y < i_lines; y += 8, p += i_stride*8 )
for( int x = 0; x < i_width; x += 8, i_mb_xy++ )
return ref->lowres[0];
}
-static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, uint8_t *src, x264_weight_t *w )
+static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, pixel *src, x264_weight_t *w )
{
unsigned int cost = 0;
int i_stride = fenc->i_stride_lowres;
int i_lines = fenc->i_lines_lowres;
int i_width = fenc->i_width_lowres;
- uint8_t *fenc_plane = fenc->lowres[0];
- ALIGNED_ARRAY_8( uint8_t, buf,[8*8] );
+ pixel *fenc_plane = fenc->lowres[0];
+ ALIGNED_ARRAY_8( pixel, buf,[8*8] );
int pixoff = 0;
int i_mb = 0;
x264_lowres_context_init( h, &a );
x264_slicetype_frame_cost( h, &a, &fenc, 0, 0, 0, 0 );
}
- uint8_t *mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
+ pixel *mcbuf = x264_weight_cost_init_luma( h, fenc, ref, h->mb.p_weight_buf[0] );
origscore = minscore = x264_weight_cost( h, fenc, mcbuf, 0 );
if( !minscore )
if( weights[0].weightfn && b_lookahead )
{
//scale lowres in lookahead for slicetype_frame_cost
- uint8_t *src = ref->buffer_lowres[0];
- uint8_t *dst = h->mb.p_weight_buf[0];
+ pixel *src = ref->buffer_lowres[0];
+ pixel *dst = h->mb.p_weight_buf[0];
int width = ref->i_width_lowres + PADH*2;
int height = ref->i_lines_lowres + PADV*2;
x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres,
i_mb_y > 0 && i_mb_y < h->sps->i_mb_height - 1) ||
h->sps->i_mb_width <= 2 || h->sps->i_mb_height <= 2;
- ALIGNED_ARRAY_8( uint8_t, pix1,[9*FDEC_STRIDE] );
- uint8_t *pix2 = pix1+8;
+ ALIGNED_ARRAY_8( pixel, pix1,[9*FDEC_STRIDE] );
+ pixel *pix2 = pix1+8;
x264_me_t m[2];
int i_bcost = COST_MAX;
int list_used = 0;
{ \
int hpel_idx1 = (((mv0)[0]&2)>>1) + ((mv0)[1]&2); \
int hpel_idx2 = (((mv1)[0]&2)>>1) + ((mv1)[1]&2); \
- uint8_t *src1 = m[0].p_fref[hpel_idx1] + ((mv0)[0]>>2) + ((mv0)[1]>>2) * m[0].i_stride[0]; \
- uint8_t *src2 = m[1].p_fref[hpel_idx2] + ((mv1)[0]>>2) + ((mv1)[1]>>2) * m[1].i_stride[0]; \
+ pixel *src1 = m[0].p_fref[hpel_idx1] + ((mv0)[0]>>2) + ((mv0)[1]>>2) * m[0].i_stride[0]; \
+ pixel *src2 = m[1].p_fref[hpel_idx2] + ((mv1)[0]>>2) + ((mv1)[1]>>2) * m[1].i_stride[0]; \
h->mc.avg[PIXEL_8x8]( pix1, 16, src1, m[0].i_stride[0], src2, m[1].i_stride[0], i_bipred_weight ); \
} \
else \
{ \
int stride1 = 16, stride2 = 16; \
- uint8_t *src1, *src2; \
+ pixel *src1, *src2; \
src1 = h->mc.get_ref( pix1, &stride1, m[0].p_fref, m[0].i_stride[0], \
(mv0)[0], (mv0)[1], 8, 8, w ); \
src2 = h->mc.get_ref( pix2, &stride2, m[1].p_fref, m[1].i_stride[0], \
lowres_intra_mb:
if( !fenc->b_intra_calculated )
{
- ALIGNED_ARRAY_16( uint8_t, edge,[33] );
- uint8_t *pix = &pix1[8+FDEC_STRIDE - 1];
- uint8_t *src = &fenc->lowres[0][i_pel_offset - 1];
+ ALIGNED_ARRAY_16( pixel, edge,[33] );
+ pixel *pix = &pix1[8+FDEC_STRIDE - 1];
+ pixel *src = &fenc->lowres[0][i_pel_offset - 1];
const int intra_penalty = 5;
int satds[3];
- memcpy( pix-FDEC_STRIDE, src-i_stride, 17 );
+ memcpy( pix-FDEC_STRIDE, src-i_stride, 17 * sizeof(pixel) );
for( int i = 0; i < 8; i++ )
pix[i*FDEC_STRIDE] = src[i*i_stride];
pix++;
#endif
/* buf1, buf2: initialised to random data; the tested functions shouldn't write into them */
-uint8_t * buf1, * buf2;
+uint8_t *buf1, *buf2;
/* buf3, buf4: used to store output */
-uint8_t * buf3, * buf4;
+uint8_t *buf3, *buf4;
+/* pbuf*: pointers to the same memory as above, just for type convenience */
+pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4;
int quiet = 0;
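The pbuf pointers presumably just alias the raw byte buffers, so allocation and the byte-level 0xCD poisoning stay unchanged while the tests index in pixels; a plausible setup (not shown in this patch):

    pbuf1 = (pixel*)buf1;
    pbuf2 = (pixel*)buf2;
    pbuf3 = (pixel*)buf3;
    pbuf4 = (pixel*)buf4;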
x264_predict_t predict_4x4[9+3];
x264_predict8x8_t predict_8x8[9+3];
x264_predict_8x8_filter_t predict_8x8_filter;
- ALIGNED_16( uint8_t edge[33] );
+ ALIGNED_16( pixel edge[33] );
uint16_t cost_mv[32];
int ret = 0, ok, used_asm;
x264_predict_8x8c_init( 0, predict_8x8c );
x264_predict_8x8_init( 0, predict_8x8, &predict_8x8_filter );
x264_predict_4x4_init( 0, predict_4x4 );
- predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
+ predict_8x8_filter( pbuf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
// maximize sum
for( int i = 0; i < 256; i++ )
used_asm = 1; \
for( int j = 0; j < 64; j++ ) \
{ \
- res_c = call_c( pixel_c.name[i], buf1, 16, buf2+j*!align, 64 ); \
- res_asm = call_a( pixel_asm.name[i], buf1, 16, buf2+j*!align, 64 ); \
+ res_c = call_c( pixel_c.name[i], pbuf1, 16, pbuf2+j*!align, 64 ); \
+ res_asm = call_a( pixel_asm.name[i], pbuf1, 16, pbuf2+j*!align, 64 ); \
if( res_c != res_asm ) \
{ \
ok = 0; \
} \
for( int j = 0; j < 0x1000 && ok; j += 256 ) \
{ \
- res_c = pixel_c .name[i]( buf3+j, 16, buf4+j, 16 ); \
- res_asm = pixel_asm.name[i]( buf3+j, 16, buf4+j, 16 ); \
+ res_c = pixel_c .name[i]( pbuf3+j, 16, pbuf4+j, 16 ); \
+ res_asm = pixel_asm.name[i]( pbuf3+j, 16, pbuf4+j, 16 ); \
if( res_c != res_asm ) \
{ \
ok = 0; \
used_asm = 1; \
for( int j = 0; j < 64; j++ ) \
{ \
- uint8_t *pix2 = buf2+j; \
- res_c[0] = pixel_c.sad[i]( buf1, 16, pix2, 64 ); \
- res_c[1] = pixel_c.sad[i]( buf1, 16, pix2+6, 64 ); \
- res_c[2] = pixel_c.sad[i]( buf1, 16, pix2+1, 64 ); \
+ pixel *pix2 = pbuf2+j; \
+ res_c[0] = pixel_c.sad[i]( pbuf1, 16, pix2, 64 ); \
+ res_c[1] = pixel_c.sad[i]( pbuf1, 16, pix2+6, 64 ); \
+ res_c[2] = pixel_c.sad[i]( pbuf1, 16, pix2+1, 64 ); \
if( N == 4 ) \
{ \
- res_c[3] = pixel_c.sad[i]( buf1, 16, pix2+10, 64 ); \
- call_a( pixel_asm.sad_x4[i], buf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
+ res_c[3] = pixel_c.sad[i]( pbuf1, 16, pix2+10, 64 ); \
+ call_a( pixel_asm.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
} \
else \
- call_a( pixel_asm.sad_x3[i], buf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
+ call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
{ \
ok = 0; \
res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
} \
if( N == 4 ) \
- call_c2( pixel_c.sad_x4[i], buf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
+ call_c2( pixel_c.sad_x4[i], pbuf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
else \
- call_c2( pixel_c.sad_x3[i], buf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
+ call_c2( pixel_c.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
} \
} \
} \
set_func_name( "%s_%s", "var", pixel_names[i] ); \
used_asm = 1; \
/* abi-check wrapper can't return uint64_t, so separate it from return value check */ \
- call_c1( pixel_c.var[i], buf1, 16 ); \
- call_a1( pixel_asm.var[i], buf1, 16 ); \
- uint64_t res_c = pixel_c.var[i]( buf1, 16 ); \
- uint64_t res_asm = pixel_asm.var[i]( buf1, 16 ); \
+ call_c1( pixel_c.var[i], pbuf1, 16 ); \
+ call_a1( pixel_asm.var[i], pbuf1, 16 ); \
+ uint64_t res_c = pixel_c.var[i]( pbuf1, 16 ); \
+ uint64_t res_asm = pixel_asm.var[i]( pbuf1, 16 ); \
if( res_c != res_asm ) \
{ \
ok = 0; \
fprintf( stderr, "var[%d]: %d %d != %d %d [FAILED]\n", i, (int)res_c, (int)(res_c>>32), (int)res_asm, (int)(res_asm>>32) ); \
} \
- call_c2( pixel_c.var[i], buf1, 16 ); \
- call_a2( pixel_asm.var[i], buf1, 16 ); \
+ call_c2( pixel_c.var[i], pbuf1, 16 ); \
+ call_a2( pixel_asm.var[i], pbuf1, 16 ); \
}
ok = 1; used_asm = 0;
int res_c, res_asm, ssd_c, ssd_asm;
set_func_name( "var2_8x8" );
used_asm = 1;
- res_c = call_c( pixel_c.var2_8x8, buf1, 16, buf2, 16, &ssd_c );
- res_asm = call_a( pixel_asm.var2_8x8, buf1, 16, buf2, 16, &ssd_asm );
+ res_c = call_c( pixel_c.var2_8x8, pbuf1, 16, pbuf2, 16, &ssd_c );
+ res_asm = call_a( pixel_asm.var2_8x8, pbuf1, 16, pbuf2, 16, &ssd_asm );
if( res_c != res_asm || ssd_c != ssd_asm )
{
ok = 0;
used_asm = 1;
for( int j = 0; j < 32; j++ )
{
- uint8_t *pix = (j&16 ? buf1 : buf3) + (j&15)*256;
- call_c1( pixel_c.hadamard_ac[i], buf1, 16 );
- call_a1( pixel_asm.hadamard_ac[i], buf1, 16 );
+ pixel *pix = (j&16 ? pbuf1 : pbuf3) + (j&15)*256;
+ call_c1( pixel_c.hadamard_ac[i], pbuf1, 16 );
+ call_a1( pixel_asm.hadamard_ac[i], pbuf1, 16 );
uint64_t rc = pixel_c.hadamard_ac[i]( pix, 16 );
uint64_t ra = pixel_asm.hadamard_ac[i]( pix, 16 );
if( rc != ra )
break;
}
}
- call_c2( pixel_c.hadamard_ac[i], buf1, 16 );
- call_a2( pixel_asm.hadamard_ac[i], buf1, 16 );
+ call_c2( pixel_c.hadamard_ac[i], pbuf1, 16 );
+ call_a2( pixel_asm.hadamard_ac[i], pbuf1, 16 );
}
report( "pixel hadamard_ac :" );
int res_c[3], res_asm[3]; \
set_func_name( #name ); \
used_asm = 1; \
- memcpy( buf3, buf2, 1024 ); \
+ memcpy( buf3, buf2, 1024 * sizeof(pixel) ); \
for( int i = 0; i < 3; i++ ) \
{ \
- pred[i]( buf3+48, ##__VA_ARGS__ ); \
- res_c[i] = pixel_c.satd( buf1+48, 16, buf3+48, 32 ); \
+ pred[i]( pbuf3+48, ##__VA_ARGS__ ); \
+ res_c[i] = pixel_c.satd( pbuf1+48, 16, pbuf3+48, 32 ); \
} \
- call_a( pixel_asm.name, buf1+48, i8x8 ? edge : buf3+48, res_asm ); \
+ call_a( pixel_asm.name, pbuf1+48, i8x8 ? edge : pbuf3+48, res_asm ); \
if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
{ \
ok = 0; \
ALIGNED_16( int sums[5][4] ) = {{0}};
used_asm = ok = 1;
x264_emms();
- res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
- res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28, buf3 );
+ res_c = x264_pixel_ssim_wxh( &pixel_c, pbuf1+2, 32, pbuf2+2, 32, 32, 28, pbuf3 );
+ res_a = x264_pixel_ssim_wxh( &pixel_asm, pbuf1+2, 32, pbuf2+2, 32, 32, 28, pbuf3 );
if( fabs( res_c - res_a ) > 1e-6 )
{
ok = 0;
fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
}
set_func_name( "ssim_core" );
- call_c2( pixel_c.ssim_4x4x2_core, buf1+2, 32, buf2+2, 32, sums );
- call_a2( pixel_asm.ssim_4x4x2_core, buf1+2, 32, buf2+2, 32, sums );
+ call_c2( pixel_c.ssim_4x4x2_core, pbuf1+2, 32, pbuf2+2, 32, sums );
+ call_a2( pixel_asm.ssim_4x4x2_core, pbuf1+2, 32, pbuf2+2, 32, sums );
set_func_name( "ssim_end" );
call_c2( pixel_c.ssim_end4, sums, sums, 4 );
call_a2( pixel_asm.ssim_end4, sums, sums, 4 );
{ \
set_func_name( #name ); \
used_asm = 1; \
- call_c( dct_c.name, t1, buf1, buf2 ); \
- call_a( dct_asm.name, t2, buf1, buf2 ); \
+ call_c( dct_c.name, t1, pbuf1, pbuf2 ); \
+ call_a( dct_asm.name, t2, pbuf1, pbuf2 ); \
if( memcmp( t1, t2, size ) ) \
{ \
ok = 0; \
// fdct and idct are denormalized by different factors, so quant/dequant
// is needed to force the coefs into the right range.
- dct_c.sub16x16_dct( dct4, buf1, buf2 );
- dct_c.sub16x16_dct8( dct8, buf1, buf2 );
+ dct_c.sub16x16_dct( dct4, pbuf1, pbuf2 );
+ dct_c.sub16x16_dct8( dct8, pbuf1, pbuf2 );
for( int i = 0; i < 16; i++ )
{
qf.quant_4x4( dct4[i], h->quant4_mf[CQM_4IY][20], h->quant4_bias[CQM_4IY][20] );
{ \
set_func_name( #name ); \
used_asm = 1; \
- memcpy( buf3, buf1, 32*32 ); \
- memcpy( buf4, buf1, 32*32 ); \
- memcpy( dct1, src, 512 ); \
- memcpy( dct2, src, 512 ); \
- call_c1( dct_c.name, buf3, (void*)dct1 ); \
- call_a1( dct_asm.name, buf4, (void*)dct2 ); \
- if( memcmp( buf3, buf4, 32*32 ) ) \
+ memcpy( buf3, buf1, 32*32 * sizeof(pixel) ); \
+ memcpy( buf4, buf1, 32*32 * sizeof(pixel) ); \
+ memcpy( dct1, src, 512 * sizeof(pixel) ); \
+ memcpy( dct2, src, 512 * sizeof(pixel) ); \
+ call_c1( dct_c.name, pbuf3, (void*)dct1 ); \
+ call_a1( dct_asm.name, pbuf4, (void*)dct2 ); \
+ if( memcmp( buf3, buf4, 32*32 * sizeof(pixel) ) ) \
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
} \
- call_c2( dct_c.name, buf3, (void*)dct1 ); \
- call_a2( dct_asm.name, buf4, (void*)dct2 ); \
+ call_c2( dct_c.name, pbuf3, (void*)dct1 ); \
+ call_a2( dct_asm.name, pbuf4, (void*)dct2 ); \
}
ok = 1; used_asm = 0;
TEST_IDCT( add4x4_idct, dct4 );
int nz_a, nz_c; \
set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
used_asm = 1; \
- memcpy( buf3, buf1, 16*FDEC_STRIDE ); \
- memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
- nz_c = call_c1( zigzag_c.name, t1, buf2, buf3 ); \
- nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4 ); \
+ memcpy( buf3, buf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
+ memcpy( buf4, buf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
+ nz_c = call_c1( zigzag_c.name, t1, pbuf2, pbuf3 ); \
+ nz_a = call_a1( zigzag_asm.name, t2, pbuf2, pbuf4 ); \
- if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a ) \
+ if( memcmp( t1, t2, size*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a ) \
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
} \
- call_c2( zigzag_c.name, t1, buf2, buf3 ); \
- call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
+ call_c2( zigzag_c.name, t1, pbuf2, pbuf3 ); \
+ call_a2( zigzag_asm.name, t2, pbuf2, pbuf4 ); \
}
#define TEST_ZIGZAG_SUBAC( name, t1, t2 ) \
used_asm = 1; \
for( int i = 0; i < 2; i++ ) \
{ \
- memcpy( buf3, buf2, 16*FDEC_STRIDE ); \
- memcpy( buf4, buf2, 16*FDEC_STRIDE ); \
+ memcpy( buf3, buf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
+ memcpy( buf4, buf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
for( int j = 0; j < 4; j++ ) \
{ \
- memcpy( buf3 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
- memcpy( buf4 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
+ memcpy( pbuf3 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
+ memcpy( pbuf4 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
} \
- nz_c = call_c1( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
- nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
- if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a || dc_c != dc_a ) \
+ nz_c = call_c1( zigzag_c.name, t1, pbuf2, pbuf3, &dc_c ); \
+ nz_a = call_a1( zigzag_asm.name, t2, pbuf2, pbuf4, &dc_a ); \
+ if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
break; \
} \
} \
- call_c2( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
- call_a2( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
+ call_c2( zigzag_c.name, t1, pbuf2, pbuf3, &dc_c ); \
+ call_a2( zigzag_asm.name, t2, pbuf2, pbuf4, &dc_a ); \
}
#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
memcpy(dct, buf1, size*sizeof(int16_t)); \
for( int i = 0; i < size; i++ ) \
dct[i] = rand()&0x1F ? 0 : dct[i]; \
- memcpy(buf3, buf4, 10*sizeof(uint8_t)); \
+ memcpy(buf3, buf4, 10); \
call_c( zigzag_c.name, t1, dct, buf3 ); \
call_a( zigzag_asm.name, t2, dct, buf4 ); \
- if( memcmp( t1, t2, size*sizeof(int16_t) ) || memcmp( buf3, buf4, 10*sizeof(uint8_t) ) ) \
+ if( memcmp( t1, t2, size*sizeof(int16_t) ) || memcmp( buf3, buf4, 10 ) ) \
{ \
ok = 0; \
} \
x264_mc_functions_t mc_c;
x264_mc_functions_t mc_ref;
x264_mc_functions_t mc_a;
- x264_pixel_function_t pixel;
+ x264_pixel_function_t pixf;
- uint8_t *src = &buf1[2*64+2];
- uint8_t *src2[4] = { &buf1[3*64+2], &buf1[5*64+2],
- &buf1[7*64+2], &buf1[9*64+2] };
- uint8_t *dst1 = buf3;
- uint8_t *dst2 = buf4;
+ pixel *src = &pbuf1[2*64+2];
+ pixel *src2[4] = { &pbuf1[3*64+2], &pbuf1[5*64+2],
+ &pbuf1[7*64+2], &pbuf1[9*64+2] };
+ pixel *dst1 = pbuf3;
+ pixel *dst2 = pbuf4;
int ret = 0, ok, used_asm;
x264_mc_init( 0, &mc_c );
x264_mc_init( cpu_ref, &mc_ref );
x264_mc_init( cpu_new, &mc_a );
- x264_pixel_init( 0, &pixel );
+ x264_pixel_init( 0, &pixf );
#define MC_TEST_LUMA( w, h ) \
if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
} \
if( mc_a.get_ref != mc_ref.get_ref ) \
{ \
- uint8_t *ref = dst2; \
+ pixel *ref = dst2; \
int ref_stride = 32; \
const x264_weight_t *weight = weight_none; \
set_func_name( "get_ref_%dx%d", w, h ); \
memset( buf3, 0xCD, 1024 ); \
memset( buf4, 0xCD, 1024 ); \
call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
- ref = (uint8_t*) call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h, weight ); \
+ ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h, weight ); \
for( int i = 0; i < h; i++ ) \
- if( memcmp( dst1+i*32, ref+i*ref_stride, w ) ) \
+ if( memcmp( dst1+i*32, ref+i*ref_stride, w * sizeof(pixel) ) ) \
{ \
fprintf( stderr, "get_ref[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
ok = 0; \
ok = 1, used_asm = 0; \
for( int i = 0; i < 10; i++ ) \
{ \
- memcpy( buf3, buf1+320, 320 ); \
- memcpy( buf4, buf1+320, 320 ); \
+ memcpy( buf3, pbuf1+320, 320 * sizeof(pixel) ); \
+ memcpy( buf4, pbuf1+320, 320 * sizeof(pixel) ); \
if( mc_a.name[i] != mc_ref.name[i] ) \
{ \
set_func_name( "%s_%s", #name, pixel_names[i] ); \
used_asm = 1; \
- call_c1( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
- call_a1( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
- if( memcmp( buf3, buf4, 320 ) ) \
+ call_c1( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
+ call_a1( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
+ if( memcmp( buf3, buf4, 320 * sizeof(pixel) ) ) \
{ \
ok = 0; \
fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
} \
- call_c2( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
- call_a2( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
+ call_c2( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
+ call_a2( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
} \
} \
}
ok = 1, used_asm = 0; \
for( int i = 1; i <= 5; i++ ) \
{ \
- ALIGNED_16( uint8_t buffC[640] ); \
- ALIGNED_16( uint8_t buffA[640] ); \
+ ALIGNED_16( pixel buffC[640] ); \
+ ALIGNED_16( pixel buffA[640] ); \
int j = X264_MAX( i*4, 2 ); \
- memset( buffC, 0, 640 ); \
- memset( buffA, 0, 640 ); \
+ memset( buffC, 0, 640 * sizeof(pixel) ); \
+ memset( buffA, 0, 640 * sizeof(pixel) ); \
x264_t ha; \
ha.mc = mc_a; \
/* w12 is the same as w16 in some cases */ \
{ \
set_func_name( "%s_w%d", #name, j ); \
used_asm = 1; \
- call_c1( mc_c.weight[i], buffC, 32, buf2+align_off, 32, &weight, 16 ); \
+ call_c1( mc_c.weight[i], buffC, 32, pbuf2+align_off, 32, &weight, 16 ); \
mc_a.weight_cache(&ha, &weight); \
- call_a1( weight.weightfn[i], buffA, 32, buf2+align_off, 32, &weight, 16 ); \
+ call_a1( weight.weightfn[i], buffA, 32, pbuf2+align_off, 32, &weight, 16 ); \
for( int k = 0; k < 16; k++ ) \
- if( memcmp( &buffC[k*32], &buffA[k*32], j ) ) \
+ if( memcmp( &buffC[k*32], &buffA[k*32], j * sizeof(pixel) ) ) \
{ \
ok = 0; \
fprintf( stderr, #name "[%d]: [FAILED] s:%d o:%d d%d\n", i, s, o, d ); \
break; \
} \
- call_c2( mc_c.weight[i], buffC, 32, buf2+align_off, 32, &weight, 16 ); \
- call_a2( weight.weightfn[i], buffA, 32, buf2+align_off, 32, &weight, 16 ); \
+ call_c2( mc_c.weight[i], buffC, 32, pbuf2+align_off, 32, &weight, 16 ); \
+ call_a2( weight.weightfn[i], buffA, 32, pbuf2+align_off, 32, &weight, 16 ); \
} \
}
if( mc_a.hpel_filter != mc_ref.hpel_filter )
{
- uint8_t *srchpel = buf1+8+2*64;
- uint8_t *dstc[3] = { buf3+8, buf3+8+16*64, buf3+8+32*64 };
- uint8_t *dsta[3] = { buf4+8, buf4+8+16*64, buf4+8+32*64 };
- void *tmp = buf3+49*64;
+ pixel *srchpel = pbuf1+8+2*64;
+ pixel *dstc[3] = { pbuf3+8, pbuf3+8+16*64, pbuf3+8+32*64 };
+ pixel *dsta[3] = { pbuf4+8, pbuf4+8+16*64, pbuf4+8+32*64 };
+ void *tmp = pbuf3+49*64;
set_func_name( "hpel_filter" );
ok = 1; used_asm = 1;
- memset( buf3, 0, 4096 );
- memset( buf4, 0, 4096 );
+ memset( pbuf3, 0, 4096 * sizeof(pixel) );
+ memset( pbuf4, 0, 4096 * sizeof(pixel) );
call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, 64, 48, 10, tmp );
call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, 64, 48, 10, tmp );
for( int i = 0; i < 3; i++ )
for( int j = 0; j < 10; j++ )
//FIXME ideally the first pixels would match too, but they aren't actually used
- if( memcmp( dstc[i]+j*64+2, dsta[i]+j*64+2, 43 ) )
+ if( memcmp( dstc[i]+j*64+2, dsta[i]+j*64+2, 43 * sizeof(pixel) ) )
{
ok = 0;
fprintf( stderr, "hpel filter differs at plane %c line %d\n", "hvc"[i], j );
if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
{
- uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 };
- uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf4+3072 };
+ pixel *dstc[4] = { pbuf3, pbuf3+1024, pbuf3+2048, pbuf3+3072 };
+ pixel *dsta[4] = { pbuf4, pbuf4+1024, pbuf4+2048, pbuf4+3072 };
set_func_name( "lowres_init" );
ok = 1; used_asm = 1;
for( int w = 40; w <= 48; w += 8 )
{
int stride = (w+8)&~15;
- call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
- call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
+ call_c( mc_c.frame_init_lowres_core, pbuf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
+ call_a( mc_a.frame_init_lowres_core, pbuf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
for( int i = 0; i < 16; i++ )
{
for( int j = 0; j < 4; j++ )
- if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
+ if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w * sizeof(pixel) ) )
{
ok = 0;
fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
int stride = 80;\
set_func_name( #name );\
used_asm = 1;\
memcpy( buf3, buf1, size*2*stride );\
memcpy( buf4, buf1, size*2*stride );\
uint16_t *sum = (uint16_t*)buf3;\
call_c1( mc_c.name, __VA_ARGS__ );\
sum = (uint16_t*)buf4;\
call_a1( mc_a.name, __VA_ARGS__ );\
if( memcmp( buf3, buf4, (stride-8)*2 )\
|| (size>9 && memcmp( buf3+18*stride, buf4+18*stride, (stride-8)*2 )))\
ok = 0;\
call_c2( mc_c.name, __VA_ARGS__ );\
call_a2( mc_a.name, __VA_ARGS__ );\
}
ok = 1; used_asm = 0;
- INTEGRAL_INIT( integral_init4h, 2, sum+stride, buf2, stride );
- INTEGRAL_INIT( integral_init8h, 2, sum+stride, buf2, stride );
+ INTEGRAL_INIT( integral_init4h, 2, sum+stride, pbuf2, stride );
+ INTEGRAL_INIT( integral_init8h, 2, sum+stride, pbuf2, stride );
INTEGRAL_INIT( integral_init4v, 14, sum, sum+9*stride, stride );
INTEGRAL_INIT( integral_init8v, 9, sum, stride );
report( "integral init :" );
for( int j = 0; j < 1024; j++ ) \
- /* two distributions of random to excersize different failure modes */ \
- buf3[j] = rand() & (i&1 ? 0xf : 0xff ); \
- memcpy( buf4, buf3, 1024 ); \
+ /* two distributions of random to exercise different failure modes */ \
+ pbuf3[j] = rand() & (i&1 ? 0xf : 0xff ); \
+ memcpy( pbuf4, pbuf3, 1024 * sizeof(pixel) ); \
if( db_a.name != db_ref.name ) \
{ \
set_func_name( #name ); \
used_asm = 1; \
- call_c1( db_c.name, buf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
- call_a1( db_a.name, buf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
- if( memcmp( buf3, buf4, 1024 ) ) \
+ call_c1( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+ call_a1( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+ if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
{ \
ok = 0; \
fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
break; \
} \
- call_c2( db_c.name, buf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
- call_a2( db_a.name, buf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+ call_c2( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
+ call_a2( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
} \
}
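The alphas[]/betas[] tables drive the H.264 deblocking thresholds: an edge is filtered only when the sample gradients across it are small enough that the discontinuity is plausibly a coding artifact rather than a real feature. A sketch of that gating decision (illustrative helper; the actual filters then apply the normal or strong mode per the spec):

static int deblock_edge_active_sketch( pixel p1, pixel p0, pixel q0, pixel q1,
                                       int alpha, int beta )
{
    return abs( p0 - q0 ) < alpha  /* step across the edge */
        && abs( p1 - p0 ) < beta   /* gradient on the p side */
        && abs( q1 - q0 ) < beta;  /* gradient on the q side */
}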
static int check_intra( int cpu_ref, int cpu_new )
{
int ret = 0, ok = 1, used_asm = 0;
- ALIGNED_16( uint8_t edge[33] );
- ALIGNED_16( uint8_t edge2[33] );
+ ALIGNED_16( pixel edge[33] );
+ ALIGNED_16( pixel edge2[33] );
struct
{
x264_predict_t predict_16x16[4+3];
x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter );
x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
- ip_c.predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
+ ip_c.predict_8x8_filter( pbuf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
#define INTRA_TEST( name, dir, w, ... )\
if( ip_a.name[dir] != ip_ref.name[dir] )\
{\
set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
used_asm = 1;\
- memcpy( buf3, buf1, 32*20 );\
- memcpy( buf4, buf1, 32*20 );\
- call_c( ip_c.name[dir], buf3+48, ##__VA_ARGS__ );\
- call_a( ip_a.name[dir], buf4+48, ##__VA_ARGS__ );\
- if( memcmp( buf3, buf4, 32*20 ) )\
+ memcpy( pbuf3, pbuf1, 32*20 * sizeof(pixel) );\
+ memcpy( pbuf4, pbuf1, 32*20 * sizeof(pixel) );\
+ call_c( ip_c.name[dir], pbuf3+48, ##__VA_ARGS__ );\
+ call_a( ip_a.name[dir], pbuf4+48, ##__VA_ARGS__ );\
+ if( memcmp( pbuf3, pbuf4, 32*20 * sizeof(pixel) ) )\
{\
fprintf( stderr, #name "[%d] : [FAILED]\n", dir );\
ok = 0;\
used_asm = 1;
for( int i = 0; i < 32; i++ )
{
- memcpy( edge2, edge, 33 );
- call_c(ip_c.predict_8x8_filter, buf1+48, edge, (i&24)>>1, i&7);
- call_a(ip_a.predict_8x8_filter, buf1+48, edge2, (i&24)>>1, i&7);
- if( memcmp( edge, edge2, 33 ) )
+ memcpy( edge2, edge, 33 * sizeof(pixel) );
+ call_c(ip_c.predict_8x8_filter, pbuf1+48, edge, (i&24)>>1, i&7);
+ call_a(ip_a.predict_8x8_filter, pbuf1+48, edge2, (i&24)>>1, i&7);
+ if( memcmp( edge, edge2, 33 * sizeof(pixel) ) )
{
fprintf( stderr, "predict_8x8_filter : [FAILED] %d %d\n", (i&24)>>1, i&7);
ok = 0;
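predict_8x8_filter builds the 33-entry edge array (left, top-left, top and top-right neighbours) that the 8x8 predictors consume, low-pass filtering the reference samples as H.264 requires for 8x8 intra prediction. The smoothing kernel, sketched in isolation (array layout omitted, helper name illustrative):

static inline pixel edge_lowpass_sketch( pixel left, pixel mid, pixel right )
{
    /* (a + 2b + c + 2) >> 2 reference-sample smoothing */
    return ( left + 2*mid + right + 2 ) >> 2;
}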
fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
return -1;
}
- buf2 = buf1 + 0xf00;
- buf3 = buf2 + 0xf00;
- buf4 = buf3 + 0x1000;
+#define INIT_POINTER_OFFSETS\
+ buf2 = buf1 + 0xf00;\
+ buf3 = buf2 + 0xf00;\
+ buf4 = buf3 + 0x1000;\
+ pbuf1 = (pixel*)buf1;\
+ pbuf2 = (pixel*)buf2;\
+ pbuf3 = (pixel*)buf3;\
+ pbuf4 = (pixel*)buf4;
+ INIT_POINTER_OFFSETS;
for( int i = 0; i < 0x1e00; i++ )
buf1[i] = rand() & 0xFF;
memset( buf1+0x1e00, 0, 0x2000 );
if( do_bench )
for( int i = 0; i < BENCH_ALIGNS && !ret; i++ )
{
- buf2 = buf1 + 0xf00;
- buf3 = buf2 + 0xf00;
- buf4 = buf3 + 0x1000;
+ INIT_POINTER_OFFSETS;
ret |= x264_stack_pagealign( check_all_flags, i*16 );
buf1 += 16;
quiet = 1;
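A final note on the bufN/pbufN split used throughout: INIT_POINTER_OFFSETS makes the pixel pointers alias the byte buffers, so at today's 8-bit depth every test is bit-identical to the old code, and the sizeof(pixel) scaling only starts to matter once pixel widens to uint16_t. Schematically (function name hypothetical):

static void canary_fill_sketch( uint8_t *buf, int n_pixels )
{
    pixel *pbuf = (pixel*)buf;                     /* same address, typed view */
    memset( buf, 0xCD, n_pixels * sizeof(pixel) ); /* n pixels' worth of bytes */
    pbuf[0] = 0;                                   /* typed write lands in the same storage */
}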