useless loads/stores and calculations of permutation vectors.
Affected functions are mc_luma, mc_chroma, get_ref, SATD, SA8D and deblock.
Gains globally vary from ~5% to 15% depending on settings, measured on a 1.42 GHz G4.
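
The core pattern being optimized, as a minimal sketch with illustrative
names (row, height, stride; not the literal x264 macros): previously every
unaligned load recomputed its permutation vector with vec_lvsl, whereas now
the vector is computed once per source pointer and reused, which is valid
whenever successive accesses share the same alignment (x264 frame strides
are multiples of 16):

    vec_u8_t perm = vec_lvsl( 0, src );    /* hoisted: computed only once */
    for( y = 0; y < height; y++ )
    {
        vec_u8_t hv = vec_ld(  0, src );
        vec_u8_t lv = vec_ld( 15, src );
        row[y] = vec_perm( hv, lv, perm ); /* no per-iteration vec_lvsl   */
        src += stride;
    }
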
E: sennindemokrit AT gmx DOT net
D: x86 asm
+N: David Wolstencroft
+D: Altivec optimizations
+
N: Eric Petit
E: eric.petit AT lapsus DOT org
C: titer
*(dst_int+15*int_dst_stride) = *(src_int + 15);
}
-/** \brief performs a 6x16 transpose of data in src, and stores it to dst
- \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing
- out of unaligned_load() */
+/** \brief performs a 6x16 transpose of data in src, and stores it to dst */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
register vec_u8_t r0, r1, r2, r3, r4, r5, r6, r7, r14, r15;\
- VEC_LOAD(src, r0, 16, vec_u8_t); \
- VEC_LOAD(src + src_stride, r1, 16, vec_u8_t); \
- VEC_LOAD(src + 2*src_stride, r2, 16, vec_u8_t); \
- VEC_LOAD(src + 3*src_stride, r3, 16, vec_u8_t); \
- VEC_LOAD(src + 4*src_stride, r4, 16, vec_u8_t); \
- VEC_LOAD(src + 5*src_stride, r5, 16, vec_u8_t); \
- VEC_LOAD(src + 6*src_stride, r6, 16, vec_u8_t); \
- VEC_LOAD(src + 7*src_stride, r7, 16, vec_u8_t); \
- VEC_LOAD(src + 14*src_stride, r14, 16, vec_u8_t); \
- VEC_LOAD(src + 15*src_stride, r15, 16, vec_u8_t); \
+ VEC_LOAD(src, r0, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + src_stride, r1, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 2*src_stride, r2, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 3*src_stride, r3, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 4*src_stride, r4, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 5*src_stride, r5, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 6*src_stride, r6, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 7*src_stride, r7, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 14*src_stride, r14, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 15*src_stride, r15, 16, vec_u8_t, pix ); \
\
- VEC_LOAD(src + 8*src_stride, r8, 16, vec_u8_t); \
- VEC_LOAD(src + 9*src_stride, r9, 16, vec_u8_t); \
- VEC_LOAD(src + 10*src_stride, r10, 16, vec_u8_t); \
- VEC_LOAD(src + 11*src_stride, r11, 16, vec_u8_t); \
- VEC_LOAD(src + 12*src_stride, r12, 16, vec_u8_t); \
- VEC_LOAD(src + 13*src_stride, r13, 16, vec_u8_t); \
+ VEC_LOAD(src + 8*src_stride, r8, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 9*src_stride, r9, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 10*src_stride, r10, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 11*src_stride, r11, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 12*src_stride, r12, 16, vec_u8_t, pix ); \
+ VEC_LOAD(src + 13*src_stride, r13, 16, vec_u8_t, pix ); \
\
/*Merge first pairs*/ \
r0 = vec_mergeh(r0, r8); /*0, 8*/ \
if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
return;
PREP_LOAD;
+ vec_u8_t _pix_ = vec_lvsl(0, pix-3);
readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
transpose4x16(line1, line2, line3, line4);
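
Note that the single _pix_ permutation vector above is computed from pix-3
and then reused by all sixteen VEC_LOADs inside readAndTranspose16x6; this
is safe because x264's frame strides are multiples of 16, so pix-3 + k*stride
has the same alignment for every row k.
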
typedef void (*pf_mc_t)( uint8_t *src, int i_src,
uint8_t *dst, int i_dst, int i_height );
+
+static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+
+
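
A worked check of the new index math, with values read off the tables above
(mv components in quarter-pel units; the example is hypothetical):

    int qpel_idx = ((2 & 3) << 2) + (1 & 3); /* mv = (1,2)  ->  idx = 9  */
    /* hpel_ref0[9] == 3 -> src1 reads the hv (diagonal) half-pel plane  */
    /* hpel_ref1[9] == 2 -> src2 reads the v (vertical) half-pel plane   */
    /* qpel_idx & 5 != 0 -> the two planes are averaged by pixel_avg2    */
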
static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
{
    return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] +
           pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] +
           pix[ 3*i_pix_next];
}
-/* pixel_avg */
-static inline void pixel_avg_w4( uint8_t *dst, int i_dst,
+
+static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, int i_dst,
uint8_t *src1, int i_src1,
- uint8_t *src2, int i_src2,
- int i_height )
+ uint8_t *src2, int i_height )
{
int x, y;
    for( y = 0; y < i_height; y++ )
    {
        for( x = 0; x < 4; x++ )
        {
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
        }
dst += i_dst;
src1 += i_src1;
- src2 += i_src2;
+ src2 += i_src1;
}
}
-static inline void pixel_avg_w8( uint8_t *dst, int i_dst,
+
+static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, int i_dst,
uint8_t *src1, int i_src1,
- uint8_t *src2, int i_src2,
- int i_height )
+ uint8_t *src2, int i_height )
{
int y;
vec_u8_t src1v, src2v;
- LOAD_ZERO;
PREP_LOAD;
PREP_STORE8;
+ PREP_LOAD_SRC( src1 );
+ PREP_LOAD_SRC( src2 );
+
for( y = 0; y < i_height; y++ )
{
- VEC_LOAD( src1, src1v, 8, vec_u8_t );
- VEC_LOAD( src2, src2v, 8, vec_u8_t );
+ VEC_LOAD( src1, src1v, 8, vec_u8_t, src1 );
+ VEC_LOAD( src2, src2v, 8, vec_u8_t, src2 );
src1v = vec_avg( src1v, src2v );
VEC_STORE8( src1v, dst );
dst += i_dst;
src1 += i_src1;
- src2 += i_src2;
+ src2 += i_src1;
}
}
-static inline void pixel_avg_w16( uint8_t *dst, int i_dst,
+
+static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, int i_dst,
uint8_t *src1, int i_src1,
- uint8_t *src2, int i_src2,
- int i_height )
+ uint8_t *src2, int i_height )
{
int y;
vec_u8_t src1v, src2v;
PREP_LOAD;
- PREP_STORE16;
+ PREP_LOAD_SRC( src1 );
+ PREP_LOAD_SRC( src2 );
+
for( y = 0; y < i_height; y++ )
{
- VEC_LOAD( src1, src1v, 16, vec_u8_t );
- VEC_LOAD( src2, src2v, 16, vec_u8_t );
+ VEC_LOAD( src1, src1v, 16, vec_u8_t, src1 );
+ VEC_LOAD( src2, src2v, 16, vec_u8_t, src2 );
src1v = vec_avg( src1v, src2v );
- VEC_STORE16( src1v, dst );
+ vec_st(src1v, 0, dst);
dst += i_dst;
src1 += i_src1;
- src2 += i_src2;
+ src2 += i_src1;
}
}
+static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, int i_dst,
+ uint8_t *src1, int i_src1,
+ uint8_t *src2, int i_height )
+{
+ x264_pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height);
+ x264_pixel_avg2_w4_altivec(dst+16, i_dst, src1+16, i_src1, src2+16, i_height);
+}
+
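The 20-pixel-wide case is now composed from the 16- and 4-wide kernels above,
replacing the open-coded pair of calls (and its FIXME) previously found in
get_ref's case 20.
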
/* mc_copy: plain c */
+
#define MC_COPY( name, a ) \
-static void name( uint8_t *src, int i_src, \
- uint8_t *dst, int i_dst, int i_height ) \
+static void name( uint8_t *dst, int i_dst, \
+ uint8_t *src, int i_src, int i_height ) \
{ \
int y; \
    for( y = 0; y < i_height; y++ ) \
    { \
        memcpy( dst, src, a ); \
        src += i_src; \
        dst += i_dst; \
} \
}
-MC_COPY( mc_copy_w4, 4 )
-MC_COPY( mc_copy_w8, 8 )
-MC_COPY( mc_copy_w16, 16 )
+MC_COPY( x264_mc_copy_w4_altivec, 4 )
+MC_COPY( x264_mc_copy_w8_altivec, 8 )
-void mc_luma_altivec( uint8_t *dst, int i_dst_stride,
+static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst,
+ uint8_t *src, int i_src, int i_height )
+{
+ int y;
+ vec_u8_t cpyV;
+ PREP_LOAD;
+ PREP_LOAD_SRC( src );
+
+ for( y = 0; y < i_height; y++)
+ {
+ VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
+ vec_st(cpyV, 0, dst);
+
+ src += i_src;
+ dst += i_dst;
+ }
+}
+
+
+static void mc_luma_altivec( uint8_t *dst, int i_dst_stride,
uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
- uint8_t *src1, *src2;
-
- /* todo : fixme... */
- int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1:0;
-
- int hpel1x = mvx>>1;
- int hpel1y = (mvy+1-correction)>>1;
- int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );
-
-
- src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);
-
- if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
+ if( qpel_idx & 5 ) /* qpel interpolation needed */
{
- int hpel2x = (mvx+1)>>1;
- int hpel2y = (mvy+correction)>>1;
- int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );
-
- src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);
+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
switch(i_width) {
case 4:
- pixel_avg_w4( dst, i_dst_stride, src1, i_src_stride,
- src2, i_src_stride, i_height );
+ x264_pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
break;
case 8:
- pixel_avg_w8( dst, i_dst_stride, src1, i_src_stride,
- src2, i_src_stride, i_height );
+ x264_pixel_avg2_w8_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
break;
case 16:
default:
- pixel_avg_w16( dst, i_dst_stride, src1, i_src_stride,
- src2, i_src_stride, i_height );
+ x264_pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
}
-
+
}
else
{
switch(i_width) {
case 4:
- mc_copy_w4( src1, i_src_stride, dst, i_dst_stride, i_height );
+ x264_mc_copy_w4_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
break;
case 8:
- mc_copy_w8( src1, i_src_stride, dst, i_dst_stride, i_height );
+ x264_mc_copy_w8_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
break;
case 16:
- mc_copy_w16( src1, i_src_stride, dst, i_dst_stride, i_height );
+ x264_mc_copy_w16_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
break;
}
-
}
}
-uint8_t *get_ref_altivec( uint8_t *dst, int * i_dst_stride,
+
+
+static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride,
uint8_t *src[4], int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
{
- uint8_t *src1, *src2;
-
- /* todo : fixme... */
- int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1:0;
-
- int hpel1x = mvx>>1;
- int hpel1y = (mvy+1-correction)>>1;
- int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );
-
-
- src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);
-
- if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
+ if( qpel_idx & 5 ) /* qpel interpolation needed */
{
- int hpel2x = (mvx+1)>>1;
- int hpel2y = (mvy+correction)>>1;
- int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );
-
- src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);
-
+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
switch(i_width) {
case 4:
- pixel_avg_w4( dst, *i_dst_stride, src1, i_src_stride,
- src2, i_src_stride, i_height );
+ x264_pixel_avg2_w4_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
break;
case 8:
- pixel_avg_w8( dst, *i_dst_stride, src1, i_src_stride,
- src2, i_src_stride, i_height );
+ x264_pixel_avg2_w8_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
break;
case 12:
case 16:
default:
- pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
- src2, i_src_stride, i_height );
+ x264_pixel_avg2_w16_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
break;
case 20:
- //FIXME suboptimal
- pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
- src2, i_src_stride, i_height );
- pixel_avg_w4( dst+16, *i_dst_stride, src1+16, i_src_stride,
- src2+16, i_src_stride, i_height );
+ x264_pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
break;
}
return dst;
-
}
else
{
LOAD_ZERO;
PREP_LOAD;
+ PREP_LOAD_SRC( src );
+ PREP_LOAD_SRC( srcp );
PREP_STORE4;
vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
permv = vec_lvsl( 0, (uint8_t *) 1 );
shiftv = vec_splat_u16( 6 );
- VEC_LOAD( src, src2v_8, 5, vec_u8_t );
+ VEC_LOAD( src, src2v_8, 5, vec_u8_t, src );
src3v_8 = vec_perm( src2v_8, src2v_8, permv );
for( y = 0; y < i_height; y++ )
{
src0v_8 = src2v_8;
src1v_8 = src3v_8;
- VEC_LOAD( srcp, src2v_8, 5, vec_u8_t );
+ VEC_LOAD( srcp, src2v_8, 5, vec_u8_t, srcp );
src3v_8 = vec_perm( src2v_8, src2v_8, permv );
dstv_16 = k32v;
LOAD_ZERO;
PREP_LOAD;
+ PREP_LOAD_SRC( src );
+ PREP_LOAD_SRC( srcp );
PREP_STORE8;
vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
permv = vec_lvsl( 0, (uint8_t *) 1 );
shiftv = vec_splat_u16( 6 );
- VEC_LOAD( src, src2v_8, 9, vec_u8_t );
+ VEC_LOAD( src, src2v_8, 9, vec_u8_t, src );
src3v_8 = vec_perm( src2v_8, src2v_8, permv );
for( y = 0; y < i_height; y++ )
{
src0v_8 = src2v_8;
src1v_8 = src3v_8;
- VEC_LOAD( srcp, src2v_8, 9, vec_u8_t );
+ VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, srcp );
src3v_8 = vec_perm( src2v_8, src2v_8, permv );
dstv_16 = k32v;
#define HPEL_FILTER_HORIZONTAL() \
{ \
- VEC_LOAD( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t ); \
- VEC_LOAD( &src[x+14+i_stride*y], src6v, 16, vec_u8_t ); \
+ VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t ); \
+ VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t ); \
\
src2v = vec_sld( src1v, src6v, 1 ); \
src3v = vec_sld( src1v, src6v, 2 ); \
\
destv = vec_packsu( dest1v, dest2v ); \
\
- VEC_STORE16( destv, &dsth[x+i_stride*y] ); \
+ VEC_STORE16( destv, &dsth[x+i_stride*y], dsth ); \
}
#define HPEL_FILTER_VERTICAL() \
{ \
- VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t ); \
- VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t ); \
- VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t ); \
- VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t ); \
- VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t ); \
- VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t ); \
+ VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
+ VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
+ VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
+ VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
+ VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
+ VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
\
temp1v = vec_u8_to_s16_h( src1v ); \
temp2v = vec_u8_to_s16_h( src2v ); \
\
destv = vec_packsu( dest1v, dest2v ); \
\
- VEC_STORE16( destv, &dstv[x+i_stride*y] ); \
+ VEC_STORE16( destv, &dstv[x+i_stride*y], dsth ); \
}
#define HPEL_FILTER_CENTRAL() \
\
destv = vec_packsu( dest1v, dest2v ); \
\
- VEC_STORE16( destv, &dstc[x-16+i_stride*y] ); \
+ VEC_STORE16( destv, &dstc[x-16+i_stride*y], dsth ); \
}
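
Note that HPEL_FILTER_VERTICAL and HPEL_FILTER_CENTRAL pass dsth's store
permutation vectors to VEC_STORE16 even though they write to dstv and dstc;
this appears to rely on the three half-pel planes being allocated with
identical 16-byte alignment, so a single lvsl/lvsr pair serves all of them.
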
void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;
PREP_LOAD;
+ PREP_LOAD_SRC( src );
PREP_STORE16;
+ PREP_STORE16_DST( dsth );
LOAD_ZERO;
vec_u16_t twov, fourv, fivev, sixv;
}
/* Partial vertical filter */
- VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t );
- VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t );
- VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t );
- VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t );
- VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t );
- VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t );
+ VEC_LOAD_PARTIAL( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src );
+ VEC_LOAD_PARTIAL( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src );
+ VEC_LOAD_PARTIAL( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src );
+ VEC_LOAD_PARTIAL( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src );
+ VEC_LOAD_PARTIAL( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src );
+ VEC_LOAD_PARTIAL( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src );
temp1v = vec_u8_to_s16_h( src1v );
temp2v = vec_u8_to_s16_h( src2v );
vec_s32_t sumv = zero_s32v; \
for( y = 0; y < ly; y++ ) \
{ \
- VEC_LOAD( pix1, pix1v, lx, vec_u8_t ); \
- VEC_LOAD( pix2, pix2v, lx, vec_u8_t ); \
+ VEC_LOAD_G( pix1, pix1v, lx, vec_u8_t ); \
+ VEC_LOAD_G( pix2, pix2v, lx, vec_u8_t ); \
sumv = (vec_s32_t) vec_sum4s( \
vec_sub( vec_max( pix1v, pix2v ), \
vec_min( pix1v, pix2v ) ), \
DECLARE_ALIGNED_16( int i_satd );
PREP_DIFF;
+ PREP_LOAD_SRC( pix1 );
vec_s16_t diff0v, diff1v, diff2v, diff3v;
vec_s16_t temp0v, temp1v, temp2v, temp3v;
vec_s32_t satdv;
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
+ vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+ vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
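
The two hoisted permute vectors alternate between even and odd rows
(offset1v for pix2's alignment, offset2v for pix2 + i_pix2); the alternation
is exact whenever 2*i_pix2 is a multiple of 16, so the alignment pattern
repeats with period two rows.
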
/* Hadamar H */
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
vec_s16_t temp0v, temp1v, temp2v, temp3v;
vec_s32_t satdv;
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
+ PREP_LOAD_SRC( pix1 );
+ vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+ vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
VEC_ADD_ABS( temp2v, satdv, satdv );
VEC_ADD_ABS( temp3v, satdv, satdv );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
temp4v, temp5v, temp6v, temp7v;
vec_s32_t satdv;
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+
+ PREP_LOAD_SRC( pix1 );
+ vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+ vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
temp4v, temp5v, temp6v, temp7v;
vec_s32_t satdv;
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+ PREP_LOAD_SRC( pix1 );
+ vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+ vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
temp4v, temp5v, temp6v, temp7v;
vec_s32_t satdv;
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+ PREP_LOAD_SRC( pix1 );
+ vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+ vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
VEC_ADD_ABS( temp6v, satdv, satdv );
VEC_ADD_ABS( temp7v, satdv, satdv );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
temp0v, temp1v, temp2v, temp3v );
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
LOAD_ZERO;
PREP_LOAD;
+ PREP_LOAD_SRC( pix2 );
vec_s32_t satdv;
vec_s16_t pix1v, pix2v;
vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
diffl4v, diffl5v, diffl6v, diffl7v;
vec_s16_t temp0v, temp1v, temp2v, temp3v,
temp4v, temp5v, temp6v, temp7v;
+ PREP_LOAD_SRC( pix2 );
+
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
int32_t i_satd=0;
PREP_DIFF;
+ PREP_LOAD_SRC( pix1 );
+ PREP_LOAD_SRC( pix2 );
vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, pix2 );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, pix2 );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, pix2 );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, pix2 );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
- VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, pix2 );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, pix2 );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, pix2 );
+ VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, pix2 );
vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;
vec_u8_t pix1v, pix2v;
vec_u32_t s1v, s2v, ssv, s12v;
PREP_LOAD;
+ PREP_LOAD_SRC( pix1 );
+ PREP_LOAD_SRC( pix2 );
LOAD_ZERO;
s1v = s2v = ssv = s12v = zero_u32v;
for(y=0; y<4; y++)
{
- VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t );
- VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t );
+ VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t, pix1 );
+ VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t, pix2 );
s1v = vec_sum4s( pix1v, s1v );
s2v = vec_sum4s( pix2v, s2v );
/***********************************************************************
* PREP_LOAD: declares two vectors required to perform unaligned loads
- * VEC_LOAD: loads n bytes from u8 * p into vector v of type t
+ * VEC_LOAD: loads n bytes from u8 * p into vector v of type t, using the
+ *           permutation vector prepared for source g by PREP_LOAD_SRC
+ * VEC_LOAD_G: loads n bytes from u8 * p into vector v of type t; use when
+ *             the source alignment is not known in advance
+ * VEC_LOAD_OFFSET: as above, but with the permutation vector o supplied directly
+ * VEC_LOAD_PARTIAL: as VEC_LOAD, but performs a single aligned load; the
+ *                   result is only fully valid when the n bytes do not
+ *                   cross a 16-byte boundary
**********************************************************************/
#define PREP_LOAD \
vec_u8_t _hv, _lv
-#define VEC_LOAD( p, v, n, t ) \
+#define PREP_LOAD_SRC( src ) \
+ vec_u8_t _##src##_ = vec_lvsl(0, src)
+
+#define VEC_LOAD_G( p, v, n, t ) \
_hv = vec_ld( 0, p ); \
v = (t) vec_lvsl( 0, p ); \
_lv = vec_ld( n - 1, p ); \
- v = (t) vec_perm( _hv, _lv, (vec_u8_t) v )
+ v = (t) vec_perm( _hv, _lv, (vec_u8_t) v )
+
+#define VEC_LOAD( p, v, n, t, g ) \
+ _hv = vec_ld( 0, p ); \
+ _lv = vec_ld( n - 1, p ); \
+ v = (t) vec_perm( _hv, _lv, (vec_u8_t) _##g##_ )
+
+#define VEC_LOAD_OFFSET( p, v, n, t, o ) \
+ _hv = vec_ld( 0, p); \
+ _lv = vec_ld( n - 1, p ); \
+ v = (t) vec_perm( _hv, _lv, (vec_u8_t) o )
+
+#define VEC_LOAD_PARTIAL( p, v, n, t, g) \
+ _hv = vec_ld( 0, p); \
+ v = (t) vec_perm( _hv, _hv, (vec_u8_t) _##g##_ )
+
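
A minimal usage sketch of the new load family (identifiers are illustrative;
this mirrors what x264_mc_copy_w16_altivec does per row):

    static void copy16( uint8_t *dst, uint8_t *buf )
    {
        PREP_LOAD;               /* declares _hv, _lv                     */
        PREP_LOAD_SRC( buf );    /* declares _buf_ = vec_lvsl( 0, buf )   */
        vec_u8_t v;
        VEC_LOAD( buf, v, 16, vec_u8_t, buf ); /* reuses _buf_, no lvsl   */
        vec_st( v, 0, dst );     /* assumes dst is 16-byte aligned        */
    }
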
/***********************************************************************
* PREP_STORE##n: declares required vectors to store n bytes to a
* VEC_STORE##n: stores n bytes from vector v to address p
**********************************************************************/
#define PREP_STORE16 \
- vec_u8_t _tmp1v, _tmp2v \
+ vec_u8_t _tmp1v \
-#define VEC_STORE16( v, p ) \
+#define PREP_STORE16_DST( dst ) \
+ vec_u8_t _##dst##l_ = vec_lvsl(0, dst); \
+ vec_u8_t _##dst##r_ = vec_lvsr(0, dst);
+
+#define VEC_STORE16( v, p, o ) \
_hv = vec_ld( 0, p ); \
- _tmp2v = vec_lvsl( 0, p ); \
_lv = vec_ld( 15, p ); \
- _tmp1v = vec_perm( _lv, _hv, _tmp2v ); \
- _tmp2v = vec_lvsr( 0, p ); \
- _lv = vec_perm( (vec_u8_t) v, _tmp1v, _tmp2v ); \
+ _tmp1v = vec_perm( _lv, _hv, _##o##l_ ); \
+ _lv = vec_perm( (vec_u8_t) v, _tmp1v, _##o##r_ ); \
vec_st( _lv, 15, (uint8_t *) p ); \
- _hv = vec_perm( _tmp1v, (vec_u8_t) v, _tmp2v ); \
- vec_st( _hv, 0, (uint8_t *) p )
+ _hv = vec_perm( _tmp1v, (vec_u8_t) v, _##o##r_ ); \
+ vec_st( _hv, 0, (uint8_t *) p )
+
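
The corresponding store side, with the destination's lvsl/lvsr pair prepared
once (a sketch; out is illustrative):

    static void store16( uint8_t *out, vec_u8_t v )
    {
        PREP_LOAD;               /* _hv and _lv are reused by VEC_STORE16 */
        PREP_STORE16;            /* declares _tmp1v                       */
        PREP_STORE16_DST( out ); /* declares _out_l_ and _out_r_          */
        VEC_STORE16( v, out, out ); /* unaligned read-modify-write store  */
    }
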
#define PREP_STORE8 \
- PREP_STORE16; \
- vec_u8_t _tmp3v, _tmp4v; \
- const vec_u8_t sel_h = \
- (vec_u8_t) CV(-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0)
-
-#define PREP_STORE8_HL \
- PREP_STORE8; \
- const vec_u8_t sel_l = \
- (vec_u8_t) CV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1)
-
-#define VEC_STORE8 VEC_STORE8_H
-
-#define VEC_STORE8_H( v, p ) \
- _tmp3v = vec_lvsr( 0, (uint8_t *) p ); \
- _tmp4v = vec_perm( (vec_u8_t) v, (vec_u8_t) v, _tmp3v ); \
- _lv = vec_ld( 7, (uint8_t *) p ); \
- _tmp1v = vec_perm( sel_h, zero_u8v, _tmp3v ); \
- _lv = vec_sel( _lv, _tmp4v, _tmp1v ); \
- vec_st( _lv, 7, (uint8_t *) p ); \
- _hv = vec_ld( 0, (uint8_t *) p ); \
- _tmp2v = vec_perm( zero_u8v, sel_h, _tmp3v ); \
- _hv = vec_sel( _hv, _tmp4v, _tmp2v ); \
- vec_st( _hv, 0, (uint8_t *) p )
-
-#define VEC_STORE8_L( v, p ) \
- _tmp3v = vec_lvsr( 8, (uint8_t *) p ); \
- _tmp4v = vec_perm( (vec_u8_t) v, (vec_u8_t) v, _tmp3v ); \
- _lv = vec_ld( 7, (uint8_t *) p ); \
- _tmp1v = vec_perm( sel_l, zero_u8v, _tmp3v ); \
- _lv = vec_sel( _lv, _tmp4v, _tmp1v ); \
- vec_st( _lv, 7, (uint8_t *) p ); \
- _hv = vec_ld( 0, (uint8_t *) p ); \
- _tmp2v = vec_perm( zero_u8v, sel_l, _tmp3v ); \
- _hv = vec_sel( _hv, _tmp4v, _tmp2v ); \
- vec_st( _hv, 0, (uint8_t *) p )
+ vec_u8_t _tmp3v \
+
+#define VEC_STORE8( v, p ) \
+ _tmp3v = vec_lvsl(0, p); \
+ v = vec_perm(v, v, _tmp3v); \
+ vec_ste((vec_u32_t)v,0,(uint32_t*)p); \
+ vec_ste((vec_u32_t)v,4,(uint32_t*)p)
+
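One caveat on the simplified VEC_STORE8: the vec_lvsl rotation and vec_ste's
element selection only cancel for pointers that are 8-byte aligned (byte
offsets 0 and 8 within a 16-byte line), which the mc destinations appear to
satisfy; in exchange the old mask-and-select read-modify-write disappears
entirely, since vec_ste writes exactly four bytes.
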
#define PREP_STORE4 \
PREP_STORE16; \
- vec_u8_t _tmp3v; \
+ vec_u8_t _tmp2v, _tmp3v; \
const vec_u8_t sel = \
(vec_u8_t) CV(-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0)
* d: s16v
*
* Loads n bytes from p1 and p2, do the diff of the high elements into
- * d, increments p1 and p2 by i1 and i2
+ * d, increments p1 and p2 by i1 and i2; g names the permutation vector
+ * (prepared with PREP_LOAD_SRC or vec_lvsl) used for the unaligned load of p2
**********************************************************************/
#define PREP_DIFF \
LOAD_ZERO; \
PREP_LOAD; \
vec_s16_t pix1v, pix2v;
-#define VEC_DIFF_H(p1,i1,p2,i2,n,d) \
- VEC_LOAD( p1, pix1v, n, vec_s16_t ); \
+
+#define VEC_DIFF_H(p1,i1,p2,i2,n,d,g) \
+ VEC_LOAD_PARTIAL( p1, pix1v, n, vec_s16_t, p1); \
pix1v = vec_u8_to_s16( pix1v ); \
- VEC_LOAD( p2, pix2v, n, vec_s16_t ); \
+ VEC_LOAD( p2, pix2v, n, vec_s16_t, g); \
pix2v = vec_u8_to_s16( pix2v ); \
d = vec_sub( pix1v, pix2v ); \
p1 += i1; \
* and i2
**********************************************************************/
#define VEC_DIFF_HL(p1,i1,p2,i2,dh,dl) \
- VEC_LOAD( p1, pix1v, 16, vec_s16_t ); \
+ pix1v = vec_ld(0, p1); \
temp0v = vec_u8_to_s16_h( pix1v ); \
temp1v = vec_u8_to_s16_l( pix1v ); \
- VEC_LOAD( p2, pix2v, 16, vec_s16_t ); \
+ VEC_LOAD( p2, pix2v, 16, vec_s16_t, p2); \
temp2v = vec_u8_to_s16_h( pix2v ); \
temp3v = vec_u8_to_s16_l( pix2v ); \
dh = vec_sub( temp0v, temp2v ); \