Now RET checks whether it immediately follows a branch, so the programmer doesn't have to keep track of that condition.
REP_RET is still needed manually when it's a branch target, but that's much rarer.
The implementation involves lots of spurious labels, but that's ok because we strip them.
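For illustration, this is roughly what the change produces when a function is assembled without the ssse3 cpuflag (a sketch only; the .loop label and surrounding instructions are hypothetical, and the actual expansion is performed by the AUTO_REP_RET and BRANCH_INSTR macros added to x86inc.asm below):

    dec r5d
    jg .loop        ; the branch mnemonic now records the address just past itself
    RET             ; sits at that address, so it assembles to the 2-byte "rep ret"

A RET that does not directly follow a branch still assembles to a plain "ret".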
; can only be 0 or 1 and is zero over 99% of the time.
test dword [t0+cb.range], 0x100
je .renorm
- REP_RET
+ RET
.renorm:
shl dword [t0+cb.low], 1
shl dword [t0+cb.range], 1
inc dword [t0+cb.queue]
jge .putbyte
- REP_RET
+ RET
.putbyte:
PROLOGUE 0,7
mov t3d, [t0+cb.queue]
add r0, 4*FDEC_STRIDEB
dec r2
jg .loop
- REP_RET
+ RET
%endmacro ; ADD_IDCT_DC
INIT_XMM sse2
add r0, FDEC_STRIDE*4
dec r2
jg .loop
- REP_RET
+ RET
INIT_XMM sse2
cglobal add16x16_idct_dc, 2,2,8
add r4, 2
dec r3
jg .loop
- REP_RET
+ RET
cglobal deblock_h_luma, 5,7,15
add r1, r1
lea r5, [r5+r1*8]
dec r6
jg .loop
- REP_RET
+ RET
%endmacro
INIT_XMM sse2
add r4, mmsize
dec r6
jg .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
LUMA_INTRA_SWAP_PQ
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
- RET
+ REP_RET
INIT_MMX cpuname
%if ARCH_X86_64
add r4, mmsize/8
dec r6
jg .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
add r4, mmsize/8
dec r5
jg .loop
- REP_RET
+ RET
cglobal deblock_intra_body
add r4, mmsize
dec r5
jg .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
lea r0, [r0+r1*(mmsize/4)]
dec r4
jg .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
dec r4
jg .loop
%endif
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
dec r5
jg .loop
%endif
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
lea r0, [r0+r1*(mmsize/4)]
dec r4
jg .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
%endif
dec r5
jg .loop
- REP_RET
+ RET
%endmacro ; DEBLOCK_CHROMA
%if ARCH_X86_64 == 0
add r4, mmsize/8
dec cntr
jg .loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
lea t5, [t5+r1*(mmsize/2)]
dec r6d
jg .loop
- REP_RET
+ RET
%endmacro ; DEBLOCK_CHROMA_INTRA
INIT_XMM sse2
lea t0, [t0+t1*2*SIZEOF_PIXEL]
sub eax, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
%if HIGH_BIT_DEPTH
lea r2, [r2+r3*2]
sub r5d, 2
jg .loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
lea r2, [r2+r3*2]
sub r5d, 2
jg .loop
- REP_RET
+ RET
%endmacro
%macro OFFSETPN 1
lea r0, [r0+r1*4]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
%macro AVG2_W_TWO 3
lea r0, [r0+r1*4]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
lea r0, [r0+r1*2*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
cglobal pixel_avg2_w16_mmx2, 6,7
sub r4, r2
lea r0, [r0+r1*2*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
cglobal pixel_avg2_w18_mmx2, 6,7
sub r4, r2
lea r0, [r0+r1*2]
dec r5d
jg .height_loop
- REP_RET
+ RET
INIT_XMM
cglobal pixel_avg2_w18_sse2, 6,7,6
lea r0, [r0+r1*2]
dec r5d
jg .height_loop
- REP_RET
+ RET
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
INIT_MMX
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
AVG2_W16 12, movd
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
cglobal pixel_avg2_w16_sse2, 6,7
sub r4, r2
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%macro AVG2_W20 1
cglobal pixel_avg2_w20_%1, 6,7
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
AVG2_W20 sse2
add r0, r1
dec r5d
jg .height_loop
- REP_RET
+ RET
%endmacro
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
lea r0, [r0+r1*4]
sub r4d, 4
jg .height_loop
- REP_RET
+ RET
%endif
%endmacro
add r1, r2
dec r5d
jg .loop2
- REP_RET
+ RET
%if mmsize==8
.width4:
dec r5d
jg .loop4
%if mmsize!=8
- REP_RET
+ RET
%else
sub dword r7m, 4
jg .width8
- REP_RET
+ RET
.width8:
%if ARCH_X86_64
lea r3, [t2+8*SIZEOF_PIXEL]
add r1, r2
dec r5d
jg .loop1d_w4
- REP_RET
+ RET
.mc1d_w8:
sub r2, 4*SIZEOF_PIXEL
sub r4, 8*SIZEOF_PIXEL
lea r1, [r1+r2*2]
sub r5d, 2
jg .loop4
- REP_RET
+ RET
.width8:
movu m0, [r3]
lea r1, [r1+r2*2]
sub r5d, 2
jg .loop8
- REP_RET
+ RET
%endmacro
%if HIGH_BIT_DEPTH
mova [r0+r4+mmsize], m4
add r4, 2*mmsize
jl .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
mova [r0+r2], m1
add r2, mmsize
jl .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
mova [r0+r2+mmsize], m4
add r2, mmsize*2
jl .loop
- REP_RET
+ RET
%endmacro ; HPEL_FILTER
INIT_MMX mmx2
add r5, mmsize
add r4, mmsize
jl .loop
- REP_RET
+ RET
%endmacro
;-----------------------------------------------------------------------------
movntq [r0+r2], m1
add r2, 8
jl .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
movntq [r0+r2], m1
add r2, 8
jl .loop
- REP_RET
+ RET
INIT_XMM
movntps [r0+r2], m4
add r2, 16
jl .loop
- REP_RET
+ RET
%endmacro
;-----------------------------------------------------------------------------
movntps [r0+r2], m1
add r2, 16
jl .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
movntps [r0+r2], m3
add r2, 16
jl .loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
lea r0, [r0+r1*2]
sub r4d, 2
jg .loop
- REP_RET
+ RET
%endmacro ; PLANE_INTERLEAVE
%macro DEINTERLEAVE_START 0
add r4, r5
dec dword r7m
jg .loopy
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
lea r1, [r1+r2*2]
sub r3d, 2
jg .loop
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
lea r1, [r1+r2*2]
sub r3d, 2
jg .loop
- REP_RET
+ RET
%endmacro ; PLANE_DEINTERLEAVE
%if HIGH_BIT_DEPTH
sub r2d, 32
jg .copy32
.ret:
- REP_RET
+ RET
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
%endrep
add r1, mmsize*8
jl .loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx
movdqa [r3+r2*2+16], m1
add r2, 16
jl .loop
- REP_RET
+ RET
%macro INTEGRAL_INIT8H 0
cglobal integral_init8h, 3,4
movdqa [r3+r2*2+16], m1
add r2, 16
jl .loop
- REP_RET
+ RET
%endmacro
INIT_XMM sse4
mova [r0+r1+mmsize], m1
add r1, 2*mmsize
jl .loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx
mova [r1+r2-8], m3
sub r2, 8
jge .loop
- REP_RET
+ RET
INIT_XMM
cglobal integral_init4v_sse2, 3,5
mova [r1+r2], m3
add r2, 16
jl .loop
- REP_RET
+ RET
cglobal integral_init4v_ssse3, 3,5
shl r2, 1
mova [r1+r2], m3
add r2, 16
jl .loop
- REP_RET
+ RET
%macro FILT8x4 7
mova %3, [r0+%7]
movdqa [r0+r6*2], xmm0
add r6, 8
jl .loop
- REP_RET
+ RET
%endmacro
INIT_XMM sse2
vmovdqu [r0+r6*2], ymm1
add r6, 16
jl .loop
- REP_RET
+ RET
psrlw m0, 4
SPLATW m0, m0
STORE8x8 m0, m0
- REP_RET
+ RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
add r0, FDEC_STRIDE
dec r1d
jg .loop
- REP_RET
+ RET
%endmacro ; PREDICT_CHROMA_P_MMX
INIT_MMX mmx2
add r0, FDEC_STRIDEB
dec r1d
jg .loop
- REP_RET
+ RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2
movd m0, r1m
add r0, FDEC_STRIDE
dec r1d
jg .loop
- REP_RET
+ RET
%endif ; !ARCH_X86_64
%macro PREDICT_16x16_P 0
dec r1d
jg .loop
%endif ; !HIGH_BIT_DEPTH
- REP_RET
+ RET
%endmacro ; PREDICT_16x16_P
INIT_XMM sse2
mova m2, [r0 - FDEC_STRIDEB+16]
mova m3, [r0 - FDEC_STRIDEB+24]
STORE16x16 m0, m1, m2, m3
- REP_RET
+ RET
INIT_XMM
cglobal predict_16x16_v_sse2, 2,2
mova m0, [r0 - FDEC_STRIDEB+ 0]
mova m1, [r0 - FDEC_STRIDEB+16]
STORE16x16_SSE2 m0, m1
- REP_RET
+ RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_16x16_v_mmx2, 1,2
movq m0, [r0 - FDEC_STRIDE + 0]
movq m1, [r0 - FDEC_STRIDE + 8]
STORE16x16 m0, m1
- REP_RET
+ RET
INIT_XMM
cglobal predict_16x16_v_sse2, 1,1
movdqa xmm0, [r0 - FDEC_STRIDE]
%endif ; HIGH_BIT_DEPTH
sub r1, 4*FDEC_STRIDEB
jge .vloop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
%else
PRED16x16_DC r1m, 5
%endif
- REP_RET
+ RET
INIT_MMX mmx2
cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC [pw_8], 4
- REP_RET
+ RET
INIT_MMX mmx2
%if HIGH_BIT_DEPTH
movd m0, r1m
SPLATW m0, m0
STORE16x16 m0, m0, m0, m0
- REP_RET
+ RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
pshufw m0, m0, 0
packuswb m0, m0
STORE16x16 m0, m0
- REP_RET
+ RET
%endif
;-----------------------------------------------------------------------------
cglobal predict_16x16_dc_core, 2,2,4
movd m3, r1m
PRED16x16_DC_SSE2 m3, 5
- REP_RET
+ RET
cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC_SSE2 [pw_8], 4
- REP_RET
+ RET
INIT_XMM sse2
%if HIGH_BIT_DEPTH
movd m0, r1m
SPLATW m0, m0
STORE16x16_SSE2 m0, m0
- REP_RET
+ RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
%1 [r0+(t0+8*%3)*SIZEOF_PIXEL], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
sub t0d, 16*%3
jge %%loop
- REP_RET
+ RET
%else
%1 [r0+(8*%3)*SIZEOF_PIXEL], [r1+16*%3], [r1+24*%3]
%1 [r0+(0 )*SIZEOF_PIXEL], [r1+0 ], [r1+ 8*%3]
PSIGND m5, m2, m1
test t3d, t3d
jnz .outer_loop_0
- REP_RET
+ RET
%endmacro
%if HIGH_BIT_DEPTH == 0
mova [r1+r3*4-1*mmsize], m5
sub r3, mmsize/2
jg .loop
- REP_RET
+ RET
%endmacro
%if ARCH_X86_64 == 0
mova [r1+r3*4-1*mmsize], m1
sub r3, mmsize
jg .loop
- REP_RET
+ RET
%endmacro
%if ARCH_X86_64 == 0
jne .loop
%endif
.ret:
- RET
+ REP_RET
.ret9:
mov eax, 9
RET
.tryret:
xor r4, -1
jne .cont
- REP_RET
+ RET
.ret9:
mov eax, 9
RET
shr r3, cl
shr r3, 1
jne .loop
- REP_RET
+ RET
%endif ; ARCH
%endmacro
inc t6d
sub t4d, t3d
jge .loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
; Pops anything that was pushed by PROLOGUE, and returns.
; REP_RET:
-; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
-; which are slow when a normal ret follows a branch.
+; Use this instead of RET if it's a branch target.
; registers:
; rN and rNq are the native-size register holding function argument N
%if mmsize == 32
vzeroupper
%endif
- ret
+ AUTO_REP_RET
%endmacro
%elif ARCH_X86_64 ; *nix x64 ;=============================================
%if mmsize == 32
vzeroupper
%endif
- ret
+ AUTO_REP_RET
%endmacro
%else ; X86_32 ;==============================================================
%if mmsize == 32
vzeroupper
%endif
- ret
+ AUTO_REP_RET
%endmacro
%endif ;======================================================================
%endmacro
%endif
+; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
+; a branch or a branch target. So switch to a 2-byte form of ret in that case.
+; We can automatically detect "follows a branch", but not a branch target.
+; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
%if has_epilogue
RET
%endif
%endmacro
+%define last_branch_adr $$
+%macro AUTO_REP_RET 0
+ %ifndef cpuflags
+    times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr
+ %elif notcpuflag(ssse3)
+ times ((last_branch_adr-$)>>31)+1 rep
+ %endif
+ ret
+%endmacro
+
+%macro BRANCH_INSTR 0-*
+ %rep %0
+ %macro %1 1-2 %1
+ %2 %1
+ %%branch_instr:
+ %xdefine last_branch_adr %%branch_instr
+ %endmacro
+ %rotate 1
+ %endrep
+%endmacro
+
+BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
+
%macro TAIL_CALL 2 ; callee, is_nonadjacent
%if has_epilogue
call %1
mov dword [r1], 0
mov eax, r3
.ok:
- RET
+ REP_RET
%endif ; ARCH_X86_64
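How the automatic detection works, in outline (an explanatory sketch, not part of the patch): BRANCH_INSTR redefines every conditional-jump mnemonic as a macro that emits the jump and then records the current address in last_branch_adr through a spurious local label, so

    jg .loop

effectively becomes

    jg .loop
    %%branch_instr:
    %xdefine last_branch_adr %%branch_instr

AUTO_REP_RET then compares that address with $. In "times ((last_branch_adr-$)>>31)+1 rep", a return placed immediately after a branch sees a difference of 0, so times evaluates to 1 and a rep prefix is emitted (the 2-byte ret); any other return sees a negative difference, the expression evaluates to 0, and only a plain ret is emitted. What can't be detected this way is a return that is itself a branch target, which is why REP_RET must still be written by hand at labels such as the .end:, .ret: and .ok: cases above.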