SECTION_RODATA 32
pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
-deinterleave_sadx4: dd 0,4,2,6
hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
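; (deinterleave_sadx4 drops out of the rodata; see SAD_X4_END_AVX2 below)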
SECTION .text
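; In the SAD_X4 setup below (four refs in r1-r4, stride in r5), the hunks
; swap r2/r3 so the lanes pair ref0 with ref2 in m0/m2 and ref1 with ref3
; in m1/m3; this ordering is what lets the rewritten END macros finish with
; a plain packssdw+phaddd instead of a vpermd shuffle.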
vbroadcasti128 m4, [r0]
vbroadcasti128 m5, [r0+FENC_STRIDE]
movu xm0, [r1]
- movu xm1, [r3]
+ movu xm1, [r2]
movu xm2, [r1+r5]
- movu xm3, [r3+r5]
- vinserti128 m0, m0, [r2], 1
+ movu xm3, [r2+r5]
+ vinserti128 m0, m0, [r3], 1
vinserti128 m1, m1, [r4], 1
- vinserti128 m2, m2, [r2+r5], 1
+ vinserti128 m2, m2, [r3+r5], 1
vinserti128 m3, m3, [r4+r5], 1
psadbw m0, m4
psadbw m1, m4
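; each psadbw yields two 64-bit row sums per lane; the accumulated totals
; rarely approach the signed 16-bit limit, so the dword-to-word packssdw in
; the END macros only saturates for hopeless candidates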
vbroadcasti128 m6, [r0+%1]
vbroadcasti128 m7, [r0+%3]
movu xm2, [r1+%2]
- movu xm3, [r3+%2]
+ movu xm3, [r2+%2]
movu xm4, [r1+%4]
- movu xm5, [r3+%4]
- vinserti128 m2, m2, [r2+%2], 1
+ movu xm5, [r2+%4]
+ vinserti128 m2, m2, [r3+%2], 1
vinserti128 m3, m3, [r4+%2], 1
- vinserti128 m4, m4, [r2+%4], 1
+ vinserti128 m4, m4, [r3+%4], 1
vinserti128 m5, m5, [r4+%4], 1
psadbw m2, m6
psadbw m3, m6
%endmacro
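; The reworked SAD_X3_END_AVX2: rather than reducing each of the three sums
; separately and storing them with three scalar movd's, pack everything into
; one register and store 16 bytes at once. The 4th dword is junk, which is
; why the C callers below pad their output arrays.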
%macro SAD_X3_END_AVX2 0
- vextracti128 xm4, m0, 1
- vextracti128 xm5, m1, 1
- vextracti128 xm6, m2, 1
- paddw xm0, xm4
- paddw xm1, xm5
- paddw xm2, xm6
- movhlps xm4, xm0
- movhlps xm5, xm1
- movhlps xm6, xm2
- paddw xm0, xm4
- paddw xm1, xm5
- paddw xm2, xm6
movifnidn r5, r5mp
- movd [r5+0], xm0
- movd [r5+4], xm1
- movd [r5+8], xm2
+ packssdw m0, m1 ; 0 0 1 1 0 0 1 1
+ packssdw m2, m2 ; 2 2 _ _ 2 2 _ _
+ phaddd m0, m2 ; 0 1 2 _ 0 1 2 _
+ vextracti128 xm1, m0, 1
+ paddd xm0, xm1 ; 0 1 2 _
+ mova [r5], xm0
RET
%endmacro
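; SAD_X4_END_AVX2 gains the same single-store finish: with refs paired
; (0,2)/(1,3) by the start macro, packssdw+phaddd already leaves the sums
; in 0 1 2 3 order, so the vpermd and its deinterleave_sadx4 constant are
; no longer needed.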
%macro SAD_X4_END_AVX2 0
- mov r0, r6mp
- punpckhqdq m2, m0, m0
- punpckhqdq m3, m1, m1
- paddw m0, m2
- paddw m1, m3
- packssdw m0, m1
- mova xm2, [deinterleave_sadx4]
- vpermd m0, m2, m0
- mova [r0], xm0
+ mov r0, r6mp
+ packssdw m0, m1 ; 0 0 1 1 2 2 3 3
+ vextracti128 xm1, m0, 1
+ phaddd xm0, xm1 ; 0 1 2 3
+ mova [r0], xm0
RET
%endmacro
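/* C-side callers (presumably x264's encoder/me.c): they must now provide
 * 16-byte-aligned, 4-entry output arrays, since the asm stores 16 bytes
 * per sad_x3 call. */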
/* hexagon */
COST_MV_X3_DIR( -2,0, -1, 2, 1, 2, costs );
- COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+3 );
+ COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+4 ); /* +4 for 16-byte alignment */
bcost <<= 3;
COPY1_IF_LT( bcost, (costs[0]<<3)+2 );
COPY1_IF_LT( bcost, (costs[1]<<3)+3 );
COPY1_IF_LT( bcost, (costs[2]<<3)+4 );
- COPY1_IF_LT( bcost, (costs[3]<<3)+5 );
- COPY1_IF_LT( bcost, (costs[4]<<3)+6 );
- COPY1_IF_LT( bcost, (costs[5]<<3)+7 );
+ COPY1_IF_LT( bcost, (costs[4]<<3)+5 );
+ COPY1_IF_LT( bcost, (costs[5]<<3)+6 );
+ COPY1_IF_LT( bcost, (costs[6]<<3)+7 );
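/* costs[3] and costs[7] are the padding dwords written by the asm, hence
 * the switch from costs[3..5] to costs[4..6]. The low 3 bits of bcost tag
 * which of the six hexagon points won (0 = none improved). */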
if( bcost&7 )
{
pixel *ref = p_fref_w+min_x+my*stride;
for( i = 0; i < xn-2; i += 3 )
{
- int sads[3];
+ ALIGNED_ARRAY_16( int, sads,[4] ); /* padded to [4] for asm */
h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
for( int j = 0; j < 3; j++ )
{