Also make memcpy_aligned support sizes smaller than 64.
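
For reference, here is the control flow all of the asm variants share after this change, as a C sketch (illustrative only, not part of the patch; the real versions use NEON/MMX/SSE block moves, and the x86 ones copy from the end of the buffer downward). The point of the change is the zero check before the 64-byte loop, which previously always ran at least once:

    #include <stddef.h>
    #include <string.h>

    /* C sketch of the dispatch: peel an odd 16-byte chunk, then an odd
     * 32-byte chunk, then copy 64 bytes per iteration. The name and the
     * use of memcpy() are illustrative; n must be a multiple of 16. */
    static void *memcpy_aligned_sketch( void *dst, const void *src, size_t n )
    {
        unsigned char *d = dst;
        const unsigned char *s = src;
        if( n & 16 )
        {
            memcpy( d, s, 16 );
            d += 16; s += 16; n -= 16;
        }
        if( n & 32 )
        {
            memcpy( d, s, 32 );
            d += 32; s += 32; n -= 32;
        }
        while( n ) /* the new check: n can now legally be 0 here */
        {
            memcpy( d, s, 64 );
            d += 64; s += 64; n -= 64;
        }
        return dst;
    }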
vst1.64 {d0-d1}, [r3,:r3align]!
32: // n is a multiple of 32
tst r2, #32
- beq 64f
+ beq 640f
sub r2, #32
vld1.64 {d0-d3}, [r1,:r1align]!
vst1.64 {d0-d3}, [r3,:r3align]!
-64: // n is a multiple of 64
+640: // n is a multiple of 64
+ cmp r2, #0
+ beq 1f
+64:
subs r2, #64
vld1.64 {d0-d3}, [r1,:r1align]!
vld1.64 {d4-d7}, [r1,:r1align]!
vst1.64 {d0-d3}, [r3,:r3align]!
vst1.64 {d4-d7}, [r3,:r3align]!
bgt 64b
+1: // end
.if \srcalign == 8 && \dstalign == 8
vld1.64 {d0}, [r1,:64]!
vst1.64 {d0}, [r3,:64]!
.endif
; These functions are not general-use; not only do the SSE ones require aligned input,
-; but they also will fail if given a non-mod16 size or a size less than 64.
+; but they also will fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.
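
As a hedged illustration of that contract (x264 itself has no such checker; callers are simply expected to pass pre-rounded sizes), the preconditions stated as C assertions:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative precondition checks, not part of x264. */
    static void check_memcpy_aligned_args( const void *dst, const void *src, size_t n )
    {
        assert( !( (uintptr_t)dst & 15 ) ); /* SSE variants need 16-byte-aligned buffers */
        assert( !( (uintptr_t)src & 15 ) );
        assert( !( n & 15 ) );              /* size must be a multiple of 16 */
    }

    static void check_memzero_aligned_args( const void *dst, size_t n )
    {
        assert( !( (uintptr_t)dst & 15 ) );
        assert( !( n & 127 ) );             /* memzero SSE needs a multiple of 128 */
    }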
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_mmx, 3,3
test r2d, 16
- jz .copy32
+ jz .copy32start
sub r2d, 16
movq mm0, [r1 + r2 + 0]
movq mm1, [r1 + r2 + 8]
movq [r0 + r2 + 0], mm0
movq [r0 + r2 + 8], mm1
+.copy32start:
+ test r2d, r2d
+ jz .ret
.copy32:
sub r2d, 32
movq mm0, [r1 + r2 + 0]
movq mm1, [r1 + r2 + 8]
movq mm2, [r1 + r2 + 16]
movq mm3, [r1 + r2 + 24]
movq [r0 + r2 + 0], mm0
movq [r0 + r2 + 8], mm1
movq [r0 + r2 + 16], mm2
movq [r0 + r2 + 24], mm3
jg .copy32
+.ret:
REP_RET
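
The new `.copy32start` zero test is what fixes n == 16 here: the old code fell straight into the 32-byte loop, which always ran at least once, so r2d underflowed to -32 and the function read and wrote the 32 bytes just below each buffer. A small runnable C demonstration of the old control flow (offsets only, hypothetical helper name, no actual copying):

    #include <stdio.h>

    /* Models the old mmx flow: peel 16, then do-while 32-byte chunks. */
    static void old_mmx_flow( int n )
    {
        if( n & 16 )
            n -= 16; /* head copy; n == 16 leaves n == 0 */
        do
        {
            n -= 32;
            printf( "copy 32 bytes at offset %d\n", n );
        } while( n > 0 );
    }

    int main( void )
    {
        old_mmx_flow( 16 ); /* prints offset -32: an out-of-bounds access */
        return 0;
    }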
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_sse2, 3,3
test r2d, 16
jz .copy32
sub r2d, 16
movdqa xmm0, [r1 + r2]
movdqa [r0 + r2], xmm0
.copy32:
test r2d, 32
- jz .copy64
+ jz .copy64start
sub r2d, 32
movdqa xmm0, [r1 + r2 + 0]
movdqa [r0 + r2 + 0], xmm0
movdqa xmm1, [r1 + r2 + 16]
movdqa [r0 + r2 + 16], xmm1
+.copy64start:
+ test r2d, r2d
+ jz .ret
.copy64:
sub r2d, 64
movdqa xmm0, [r1 + r2 + 0]
movdqa xmm1, [r1 + r2 + 16]
movdqa xmm2, [r1 + r2 + 32]
movdqa xmm3, [r1 + r2 + 48]
movdqa [r0 + r2 + 0], xmm0
movdqa [r0 + r2 + 16], xmm1
movdqa [r0 + r2 + 32], xmm2
movdqa [r0 + r2 + 48], xmm3
jg .copy64
+.ret:
REP_RET
;-----------------------------------------------------------------------------
report( "mbtree propagate :" );
}
+ if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
+ {
+ set_func_name( "memcpy_aligned" );
+ ok = 1; used_asm = 1;
+ for( int size = 16; size < 256; size += 16 )
+ {
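+ /* fill size+1 bytes so a 0xAA guard byte sits past the end; the check below catches writes beyond 'size' */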
+ memset( buf4, 0xAA, size + 1 );
+ call_c( mc_c.memcpy_aligned, buf3, buf1, size );
+ call_a( mc_a.memcpy_aligned, buf4, buf1, size );
+ if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
+ {
+ ok = 0;
+ fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", size );
+ break;
+ }
+ }
+ report( "memcpy aligned :" );
+ }
+
+ if( mc_a.memzero_aligned != mc_ref.memzero_aligned )
+ {
+ set_func_name( "memzero_aligned" );
+ ok = 1; used_asm = 1;
+ for( int size = 128; size < 1024; size += 128 )
+ {
+ memset( buf4, 0xAA, size + 1 );
+ call_c( mc_c.memzero_aligned, buf3, size );
+ call_a( mc_a.memzero_aligned, buf4, size );
+ if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
+ {
+ ok = 0;
+ fprintf( stderr, "memzero_aligned FAILED: size=%d\n", size );
+ break;
+ }
+ }
+ report( "memzero aligned :" );
+ }
+
return ret;
}