; t3 must be ecx, since it's used as the shift count (variable shifts use cl).
%ifdef WIN64
- DECLARE_REG_TMP 3,1,2,0,4,5,6,2
+ DECLARE_REG_TMP 3,1,2,0,6,5,4,2
%define pointer resq
%elifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6,6
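; Under the new x86inc numbering, WIN64 rdi/rsi become the callee-saved r7/r8
; while r4/r5 map to volatile R10/R11, so the temp order is reshuffled to
; stay entirely in volatile registers and avoid extra pushes.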
%macro LOAD_GLOBAL 4
%ifdef PIC
; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
- lea r11, [%2]
+ lea r7, [%2]
%ifnidn %3, 0
- add r11, %3
+ add r7, %3
%endif
- movzx %1, byte [r11+%4]
+ movzx %1, byte [r7+%4]
%else
movzx %1, byte [%2+%3+%4]
%endif
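; Roughly: "LOAD_GLOBAL dst, table, offset, idx" is a PIC-safe
;     movzx dst, byte [table+offset+idx]
; e.g. the cabac_range_lps load below expands (non-PIC) to
;     movzx t5d, byte [cabac_range_lps-4+t5+t4*2]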
and t4d, t6d
shr t5d, 6
movifnidn t2d, r2m
+%ifdef WIN64
+ PUSH r7
+%endif
LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2
LOAD_GLOBAL t4d, cabac_transition, t2, t6*2
and t6d, 1
mov t4d, t3d
shr t3d, 3
LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
+%ifdef WIN64
+ POP r7
+%endif
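; r7 is rdi on WIN64 under the new numbering, i.e. callee-saved, and the
; cabac functions declare only 7 regs (PROLOGUE 0,7 below), so the PROLOGUE
; never saves it; LOAD_GLOBAL's PIC scratch use of r7 therefore has to be
; bracketed by an explicit PUSH/POP.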
shl t4d, t3b
shl t6d, t3b
mov [t0+cb.range], t4d
PROLOGUE 0,7
mov t3d, [t0+cb.queue]
mov t6d, [t0+cb.low]
- jmp cabac_putbyte
cabac_putbyte:
; alive: t0=cb t3=queue t6=low
%ifdef WIN64
- DECLARE_REG_TMP 3,4,1,0,2,5,6,10
+ DECLARE_REG_TMP 3,6,1,0,2,5,4
%endif
mov t1d, -1
add t3d, 10
%endif
%endif ; !HIGH_BIT_DEPTH
.skip_prologue:
-%ifdef WIN64
- sub rsp, 8
-%endif
call %2.skip_prologue
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE
%ifdef WIN64
call %2.skip_prologue
- add rsp, 8
RET
%else
jmp %2.skip_prologue
add r0, 4*FDEC_STRIDE
%endif
.skip_prologue:
-%ifdef WIN64
- sub rsp, 8
-%endif
call %2.skip_prologue
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
%ifdef WIN64
call %2.skip_prologue
- add rsp, 8
RET
%else
jmp %2.skip_prologue
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
-cglobal deblock_h_luma, 5,7
- movsxd r10, r1d
- lea r11, [r10+r10*2]
- lea r6, [r0-4]
- lea r5, [r0-4+r11]
+cglobal deblock_h_luma, 5,9
+ movsxd r7, r1d
+ lea r8, [r7*3]
+ lea r6, [r0-4]
+ lea r5, [r0-4+r8]
%ifdef WIN64
- sub rsp, 0x98
+ sub rsp, 0x98
%define pix_tmp rsp+0x30
%else
- sub rsp, 0x68
+ sub rsp, 0x68
%define pix_tmp rsp
%endif
; transpose 6x16 -> tmp space
- TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
- lea r6, [r6+r10*8]
- lea r5, [r5+r10*8]
- TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
+ lea r6, [r6+r7*8]
+ lea r5, [r5+r7*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
- ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
+ ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30]
mov r1d, 0x10
%ifdef WIN64
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
- TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+ TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
- shl r10, 3
- sub r6, r10
- sub r5, r10
- shr r10, 3
+ shl r7, 3
+ sub r6, r7
+ sub r5, r7
+ shr r7, 3
movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
- TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+ TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
%ifdef WIN64
add rsp, 0x98
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra, 4,7
- movsxd r10, r1d
- lea r11, [r10*3]
- lea r6, [r0-4]
- lea r5, [r0-4+r11]
- sub rsp, 0x88
+cglobal deblock_h_luma_intra, 4,9
+ movsxd r7, r1d
+ lea r8, [r7*3]
+ lea r6, [r0-4]
+ lea r5, [r0-4+r8]
+ sub rsp, 0x88
%define pix_tmp rsp
; transpose 8x16 -> tmp space
- TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
- lea r6, [r6+r10*8]
- lea r5, [r5+r10*8]
- TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+ lea r6, [r6+r7*8]
+ lea r5, [r5+r7*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
lea r0, [pix_tmp+0x40]
mov r1, 0x10
call deblock_v_luma_intra
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
- lea r5, [r6+r11]
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
- shl r10, 3
- sub r6, r10
- sub r5, r10
- shr r10, 3
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
- add rsp, 0x88
+ lea r5, [r6+r8]
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
+ shl r7, 3
+ sub r6, r7
+ sub r5, r7
+ shr r7, 3
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
+ add rsp, 0x88
RET
%else
cglobal deblock_h_luma_intra, 2,4
%endif
%macro DEBLOCK_H_CHROMA_422 0
-cglobal deblock_h_chroma_422, 5,7,8
+cglobal deblock_h_chroma_422, 5,8,8
%ifdef ARCH_X86_64
- %define cntr r11
+ %define cntr r7
%else
%define cntr dword r0m
%endif
; implicit weighted biprediction
;=============================================================================
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-%ifdef ARCH_X86_64
- DECLARE_REG_TMP 0,1,2,3,4,5,10,11
- %macro AVG_START 0-1 0
- PROLOGUE 6,7,%1
%ifdef WIN64
- movsxd r5, r5d
-%endif
+ DECLARE_REG_TMP 0,1,2,3,4,5,4,5
+ %macro AVG_START 0-1 0
+ PROLOGUE 5,7,%1
+ movsxd r5, dword r5m
+ %endmacro
+%elifdef UNIX64
+ DECLARE_REG_TMP 0,1,2,3,4,5,7,8
+ %macro AVG_START 0-1 0
+ PROLOGUE 6,9,%1
%endmacro
%else
DECLARE_REG_TMP 1,2,3,4,5,6,1,2
jg avg_w16_align%1_%2_ssse3
ret
%if %1==0
- times 13 db 0x90 ; make sure the first ones don't end up short
+ ; make sure the first ones don't end up short
+ ALIGN 16
+ times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
%endif
%endmacro
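; Replaces the old hardcoded "times 13 db 0x90" pad: ALIGN plus the computed
; nop count keeps the first (shortest) branches filling out the 48-byte
; jump-table spacing (see the "*48" jump comment below) without a magic byte
; count that silently breaks when the code size changes.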
and eax, 7
jz x264_pixel_avg2_w16_sse2
%endif
- PROLOGUE 6, 7
+ PROLOGUE 6, 8
lea r6, [r4+r2]
and r4, ~0xf
and r6, 0x1f
shl r6, 4 ;jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
%ifdef PIC
- lea r11, [avg_w16_addr]
- add r6, r11
+ lea r7, [avg_w16_addr]
+ add r6, r7
%else
lea r6, [avg_w16_addr + r6]
%endif
;=============================================================================
%ifdef ARCH_X86_64
- DECLARE_REG_TMP 10,11,6
+ DECLARE_REG_TMP 6,7,8
%else
DECLARE_REG_TMP 0,1,2
%endif
-%macro MC_CHROMA_START 0
+%macro MC_CHROMA_START 1
+%ifdef ARCH_X86_64
+ PROLOGUE 0,9,%1
+%else
+ PROLOGUE 0,6,%1
+%endif
movifnidn r3, r3mp
movifnidn r4d, r4m
movifnidn r5d, r5m
- movifnidn t2d, r6m
- mov t0d, t2d
+ movifnidn t0d, r6m
+ mov t2d, t0d
mov t1d, r5d
sar t0d, 3
sar t1d, 3
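; The chroma mv arrives in 1/8-pel units: >>3 yields the whole-pel offset for
; addressing, and the low 3 bits (the "and r5d, 7"/"and t2d, 7" below) select
; the bilinear filter weights.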
; int width, int height )
;-----------------------------------------------------------------------------
%macro MC_CHROMA 0
-cglobal mc_chroma, 0,6
- MC_CHROMA_START
+cglobal mc_chroma
+ MC_CHROMA_START 0
FIX_STRIDES r4
and r5d, 7
%ifdef ARCH_X86_64
movifnidn r5d, r8m
cmp dword r7m, 4
jg .mc1d_w8
- mov r10, r2
- mov r11, r4
+ mov r7, r2
+ mov r8, r4
%if mmsize!=8
shr r5d, 1
%endif
%else
movu m0, [r3]
movu m1, [r3+r6]
- add r3, r11
+ add r3, r8
movu m2, [r3]
movu m3, [r3+r6]
%endif
movq m0, [r3]
movq m1, [r3+r6]
%if mmsize!=8
- add r3, r11
+ add r3, r8
movhps m0, [r3]
movhps m1, [r3+r6]
%endif
psrlw m2, 3
%ifdef HIGH_BIT_DEPTH
%if mmsize == 8
- xchg r4, r11
- xchg r2, r10
+ xchg r4, r8
+ xchg r2, r7
%endif
movq [r0], m0
movq [r1], m2
%if mmsize == 16
- add r0, r10
- add r1, r10
+ add r0, r7
+ add r1, r7
movhps [r0], m0
movhps [r1], m2
%endif
%else ; !HIGH_BIT_DEPTH
packuswb m0, m2
%if mmsize==8
- xchg r4, r11
- xchg r2, r10
+ xchg r4, r8
+ xchg r2, r7
movd [r0], m0
psrlq m0, 32
movd [r1], m0
movhlps m1, m0
movd [r0], m0
movd [r1], m1
- add r0, r10
- add r1, r10
+ add r0, r7
+ add r1, r7
psrldq m0, 4
psrldq m1, 4
movd [r0], m0
.mc1d_w8:
sub r2, 4*SIZEOF_PIXEL
sub r4, 8*SIZEOF_PIXEL
- mov r10, 4*SIZEOF_PIXEL
- mov r11, 8*SIZEOF_PIXEL
+ mov r7, 4*SIZEOF_PIXEL
+ mov r8, 8*SIZEOF_PIXEL
%if mmsize==8
shl r5d, 1
%endif
%endif ; ARCH_X86_64
%endmacro ; MC_CHROMA
-
%macro MC_CHROMA_SSSE3 0
-cglobal mc_chroma, 0,6,9
- MC_CHROMA_START
+cglobal mc_chroma
+ MC_CHROMA_START 9
and r5d, 7
and t2d, 7
mov t0d, r5d
mova %1, m1
mova %2, m4
FILT_PACK m1, m4, 5, m15
- movntps [r11+r4+%5], m1
+ movntps [r8+r4+%5], m1
%endmacro
%macro FILT_C 4
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; uint8_t *src, int stride, int width, int height)
;-----------------------------------------------------------------------------
-cglobal hpel_filter, 7,7,16
+cglobal hpel_filter, 7,9,16
%ifdef WIN64
movsxd r4, r4d
movsxd r5, r5d
%endif
- mov r10, r3
+ mov r7, r3
sub r5, 16
- mov r11, r1
- and r10, 15
- sub r3, r10
+ mov r8, r1
+ and r7, 15
+ sub r3, r7
add r0, r5
- add r11, r5
- add r10, r5
+ add r8, r5
+ add r7, r5
add r5, r2
mov r2, r4
- neg r10
+ neg r7
lea r1, [r3+r2]
sub r3, r2
sub r3, r2
- mov r4, r10
+ mov r4, r7
mova m15, [pw_16]
%if cpuflag(ssse3)
mova m0, [filt_mul51]
cmp r4, 16
jl .lastx
; setup regs for next y
- sub r4, r10
+ sub r4, r7
sub r4, r2
sub r1, r4
sub r3, r4
add r0, r2
- add r11, r2
+ add r8, r2
add r5, r2
- mov r4, r10
+ mov r4, r7
sub r6d, 1
jg .loopy
sfence
; uint8_t *srcv, int i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
-cglobal plane_copy_interleave_core, 7,7
+cglobal plane_copy_interleave_core, 7,9
FIX_STRIDES r1d, r3d, r5d, r6d
%ifdef HIGH_BIT_DEPTH
mov r1m, r1d
add r2, r6
add r4, r6
%ifdef ARCH_X86_64
- DECLARE_REG_TMP 10,11
+ DECLARE_REG_TMP 7,8
%else
DECLARE_REG_TMP 1,3
%endif
%macro BACKUP_POINTERS 0
%ifdef ARCH_X86_64
- mov r10, r0
- mov r11, r2
+%ifdef WIN64
+ PUSH r7
+%endif
+ mov r6, r0
+ mov r7, r2
%endif
%endmacro
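; On WIN64 the new r7 is callee-saved rdi, so a macro that borrows it has to
; PUSH/POP it explicitly; r6 (rax) is volatile on every ABI and needs no such
; care.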
%macro RESTORE_AND_INC_POINTERS 0
%ifdef ARCH_X86_64
- lea r0, [r10+8]
- lea r2, [r11+8]
+ lea r0, [r6+8]
+ lea r2, [r7+8]
+%ifdef WIN64
+ POP r7
+%endif
%else
mov r0, r0mp
mov r2, r2mp
; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
cglobal pixel_sa8d_8x8_internal
- lea r10, [r0+4*r1]
- lea r11, [r2+4*r3]
+ lea r6, [r0+4*r1]
+ lea r7, [r2+4*r3]
LOAD_SUMSUB_8x4P 0, 1, 2, 8, 5, 6, 7, r0, r2
- LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r10, r11
+ LOAD_SUMSUB_8x4P 4, 5, 3, 9, 11, 6, 7, r6, r7
%if vertical
HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
%else ; non-sse2
SAVE_MM_PERMUTATION
ret
-cglobal pixel_sa8d_8x8, 4,6,12
+cglobal pixel_sa8d_8x8, 4,8,12
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
shr eax, 1
RET
-cglobal pixel_sa8d_16x16, 4,6,12
+cglobal pixel_sa8d_16x16, 4,8,12
FIX_STRIDES r1, r3
lea r4, [3*r1]
lea r5, [3*r3]
%endif
RET
-%ifdef ARCH_X86_64
- %define t0 r10
- %define t2 r11
-%else
- %define t0 r0
- %define t2 r2
-%endif
-
;-----------------------------------------------------------------------------
; void intra_satd_x3_16x16( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
%endif
; 1D hadamards
- mov t0d, 12
+ mov r3d, 12
movd m6, [pw_32]
.loop_edge:
- SCALAR_HADAMARD left, t0, m0, m1
- SCALAR_HADAMARD top, t0, m1, m2, m3
+ SCALAR_HADAMARD left, r3, m0, m1
+ SCALAR_HADAMARD top, r3, m1, m2, m3
pavgw m0, m1
paddw m6, m0
- sub t0d, 4
+ sub r3d, 4
jge .loop_edge
psrlw m6, 2
pand m6, [sw_f0] ; dc
ADD rsp, stack_pad
RET
+%ifdef ARCH_X86_64
+ %define t0 r6
+%else
+ %define t0 r2
+%endif
+
;-----------------------------------------------------------------------------
; void intra_satd_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
mova [sums+16], m7
; 1D hadamards
- mov t0d, 4
+ mov r3d, 4
.loop_edge:
- SCALAR_HADAMARD left, t0, m0, m1
- SCALAR_HADAMARD top, t0, m0, m1, m2
- sub t0d, 4
+ SCALAR_HADAMARD left, r3, m0, m1
+ SCALAR_HADAMARD top, r3, m0, m1, m2
+ sub r3d, 4
jge .loop_edge
; dc
- movzx t2d, word [left_1d+0]
+ movzx t0d, word [left_1d+0]
movzx r3d, word [top_1d+0]
movzx r4d, word [left_1d+8]
movzx r5d, word [top_1d+8]
- lea t2d, [t2 + r3 + 16]
+ lea t0d, [t0 + r3 + 16]
lea r3d, [r4 + r5 + 16]
- shr t2d, 1
+ shr t0d, 1
shr r3d, 1
add r4d, 8
add r5d, 8
- and t2d, -16 ; tl
+ and t0d, -16 ; tl
and r3d, -16 ; br
and r4d, -16 ; bl
and r5d, -16 ; tr
- mov [dc_1d+ 0], t2d ; tl
+ mov [dc_1d+ 0], t0d ; tl
mov [dc_1d+ 4], r5d ; tr
mov [dc_1d+ 8], r4d ; bl
mov [dc_1d+12], r3d ; br
; This is not true for score64.
cglobal decimate_score%1, 1,3
%ifdef PIC
- lea r10, [decimate_table4]
- lea r11, [decimate_mask_table4]
- %define table r10
- %define mask_table r11
+ lea r4, [decimate_table4]
+ lea r5, [decimate_mask_table4]
+ %define table r4
+ %define mask_table r5
%else
%define table decimate_table4
%define mask_table decimate_mask_table4
%macro DECIMATE8x8 0
%ifdef ARCH_X86_64
-cglobal decimate_score64, 1,4
+cglobal decimate_score64, 1,5
%ifdef PIC
- lea r10, [decimate_table8]
- %define table r10
+ lea r4, [decimate_table8]
+ %define table r4
%else
%define table decimate_table8
%endif
jmp pixel_sad_x3_%1x%2_%4
.split:
%ifdef ARCH_X86_64
- PROLOGUE 6,7
+ PROLOGUE 6,9
%ifdef WIN64
movsxd r4, r4d
sub rsp, 8
mov r2, r1
mov r1, FENC_STRIDE
mov r3, r4
- mov r10, r0
- mov r11, r5
+ mov r7, r0
+ mov r8, r5
call pixel_sad_%1x%2_cache%3_%5
- mov [r11], eax
+ mov [r8], eax
%ifdef WIN64
mov r2, [rsp]
%else
pop r2
%endif
- mov r0, r10
+ mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
- mov [r11+4], eax
+ mov [r8+4], eax
%ifdef WIN64
mov r2, [rsp+8]
%else
pop r2
%endif
- mov r0, r10
+ mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
- mov [r11+8], eax
+ mov [r8+8], eax
%ifdef WIN64
add rsp, 24
%endif
jmp pixel_sad_x4_%1x%2_%4
.split:
%ifdef ARCH_X86_64
- PROLOGUE 6,7
- mov r11, r6mp
+ PROLOGUE 6,9
+ mov r8, r6mp
%ifdef WIN64
movsxd r5, r5d
%endif
mov r2, r1
mov r1, FENC_STRIDE
mov r3, r5
- mov r10, r0
+ mov r7, r0
call pixel_sad_%1x%2_cache%3_%5
- mov [r11], eax
+ mov [r8], eax
%ifdef WIN64
mov r2, [rsp]
%else
pop r2
%endif
- mov r0, r10
+ mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
- mov [r11+4], eax
+ mov [r8+4], eax
%ifdef WIN64
mov r2, [rsp+8]
%else
pop r2
%endif
- mov r0, r10
+ mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
- mov [r11+8], eax
+ mov [r8+8], eax
%ifdef WIN64
mov r2, [rsp+16]
%else
pop r2
%endif
- mov r0, r10
+ mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
- mov [r11+12], eax
+ mov [r8+12], eax
%ifdef WIN64
add rsp, 24
%endif
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
-;* Copyright (C) 2005-2011 x264 project
+;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Anton Mitrofanov <BugMaster@narod.ru>
;* Fiona Glaser <fiona@x264.com>
+;* Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
default rel
%endif
+; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
+CPU amdnop
+
; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
-%macro DECLARE_REG 6
+%macro DECLARE_REG 5-6
%define r%1q %2
%define r%1d %3
%define r%1w %4
%define r%1b %5
- %define r%1m %6
- %ifid %6 ; i.e. it's a register
+ %if %0 == 5
+ %define r%1m %3
%define r%1mp %2
%elifdef ARCH_X86_64 ; memory
- %define r%1mp qword %6
+ %define r%1m [rsp + stack_offset + %6]
+ %define r%1mp qword r %+ %1m
%else
- %define r%1mp dword %6
+ %define r%1m [esp + stack_offset + %6]
+ %define r%1mp dword r %+ %1m
%endif
%define r%1 %2
%endmacro
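; e.g. on WIN64, "DECLARE_REG 4, R10, R10D, R10W, R10B, 40" gives r4 = R10
; and r4m = [rsp + stack_offset + 40], while the 5-argument register-only
; form "DECLARE_REG 0, rcx, ecx, cx, cl" defines r0m as plain ecx, since that
; argument never lives in memory.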
%endrep
%endmacro
-DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%ifdef ARCH_X86_64
%define gprsize 8
%assign stack_offset stack_offset-gprsize
%endmacro
+%macro PUSH_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ PUSH r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ pop r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+ %rep %0
+ %if %1 < num_args
+ mov r%1, r %+ %1 %+ mp
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
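; e.g. "PUSH_IF_USED 7, 8" pushes r7/r8 only when regs_used covers them. PUSH
; (uppercase) is presumably the stack_offset-tracking wrapper (cf. the SUB
; wrapper below), keeping the rNm offsets valid; POP_IF_USED can use a bare
; pop since nothing reads rNm once the epilogue starts.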
%macro SUB 2
sub %1, %2
%ifidn %1, rsp
%ifdef WIN64 ; Windows x64 ;=================================================
-DECLARE_REG 0, rcx, ecx, cx, cl, ecx
-DECLARE_REG 1, rdx, edx, dx, dl, edx
-DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
-DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
-DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
-DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
-DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
-%define r7m [rsp + stack_offset + 64]
-%define r8m [rsp + stack_offset + 72]
-
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
- %if %1 < %2
- mov r%1, [rsp + stack_offset + 8 + %1*8]
- %endif
-%endmacro
+DECLARE_REG 0, rcx, ecx, cx, cl
+DECLARE_REG 1, rdx, edx, dx, dl
+DECLARE_REG 2, R8, R8D, R8W, R8B
+DECLARE_REG 3, R9, R9D, R9W, R9B
+DECLARE_REG 4, R10, R10D, R10W, R10B, 40
+DECLARE_REG 5, R11, R11D, R11W, R11B, 48
+DECLARE_REG 6, rax, eax, ax, al, 56
+DECLARE_REG 7, rdi, edi, di, dil, 64
+DECLARE_REG 8, rsi, esi, si, sil, 72
+DECLARE_REG 9, rbx, ebx, bx, bl, 80
+DECLARE_REG 10, rbp, ebp, bp, bpl, 88
+DECLARE_REG 11, R12, R12D, R12W, R12B, 96
+DECLARE_REG 12, R13, R13D, R13W, R13B, 104
+DECLARE_REG 13, R14, R14D, R14W, R14B, 112
+DECLARE_REG 14, R15, R15D, R15W, R15B, 120
%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
- ASSERT %2 >= %1
+ %assign num_args %1
%assign regs_used %2
- ASSERT regs_used <= 7
- %if regs_used > 4
- push r4
- push r5
- %assign stack_offset stack_offset+16
- %endif
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
WIN64_SPILL_XMM %3
- LOAD_IF_USED 4, %1
- LOAD_IF_USED 5, %1
- LOAD_IF_USED 6, %1
+ LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS %4
%endmacro
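; e.g. "PROLOGUE 6,9" on WIN64 pushes r7/r8 (rdi/rsi), then loads the stack
; args r4 and r5 from [rsp + stack_offset + 40] and [rsp + stack_offset + 48],
; stack_offset having been advanced by the two pushes so the offsets still
; line up.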
%endif
ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 6
- sub rsp, (xmm_regs_used-6)*16+16
- %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
+ SUB rsp, (xmm_regs_used-6)*16+16
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%assign %%i %%i-1
- movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
+ movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
%endrep
%endif
%endmacro
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%assign %%i %%i-1
- movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
+ movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
%endrep
add %1, (xmm_regs_used-6)*16+16
%endif
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp
- %if regs_used > 4
- pop r5
- pop r4
- %endif
+ POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
ret
%endmacro
%macro REP_RET 0
- %if regs_used > 4 || xmm_regs_used > 6
+ %if regs_used > 7 || xmm_regs_used > 6
RET
%else
rep ret
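; A plain 1-byte ret at a branch target mispredicts on AMD K8/K10; the
; redundant rep prefix is the standard workaround. It's only usable when the
; prologue saved nothing, hence the regs/xmm check above falling back to the
; full RET.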
%elifdef ARCH_X86_64 ; *nix x64 ;=============================================
-DECLARE_REG 0, rdi, edi, di, dil, edi
-DECLARE_REG 1, rsi, esi, si, sil, esi
-DECLARE_REG 2, rdx, edx, dx, dl, edx
-DECLARE_REG 3, rcx, ecx, cx, cl, ecx
-DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
-DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
-DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
-%define r7m [rsp + stack_offset + 16]
-%define r8m [rsp + stack_offset + 24]
-
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
- %if %1 < %2
- mov r%1, [rsp - 40 + %1*8]
- %endif
-%endmacro
+DECLARE_REG 0, rdi, edi, di, dil
+DECLARE_REG 1, rsi, esi, si, sil
+DECLARE_REG 2, rdx, edx, dx, dl
+DECLARE_REG 3, rcx, ecx, cx, cl
+DECLARE_REG 4, R8, R8D, R8W, R8B
+DECLARE_REG 5, R9, R9D, R9W, R9B
+DECLARE_REG 6, rax, eax, ax, al, 8
+DECLARE_REG 7, R10, R10D, R10W, R10B, 16
+DECLARE_REG 8, R11, R11D, R11W, R11B, 24
+DECLARE_REG 9, rbx, ebx, bx, bl, 32
+DECLARE_REG 10, rbp, ebp, bp, bpl, 40
+DECLARE_REG 11, R12, R12D, R12W, R12B, 48
+DECLARE_REG 12, R13, R13D, R13W, R13B, 56
+DECLARE_REG 13, R14, R14D, R14W, R14B, 64
+DECLARE_REG 14, R15, R15D, R15W, R15B, 72
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
- ASSERT %2 >= %1
- ASSERT %2 <= 7
- LOAD_IF_USED 6, %1
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS %4
%endmacro
%macro RET 0
+ POP_IF_USED 14, 13, 12, 11, 10, 9
ret
%endmacro
%macro REP_RET 0
- rep ret
+ %if regs_used > 9
+ RET
+ %else
+ rep ret
+ %endif
%endmacro
%else ; X86_32 ;==============================================================
-DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
-DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
-DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
-DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
-DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
-DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
-DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
-%define r7m [esp + stack_offset + 32]
-%define r8m [esp + stack_offset + 36]
+DECLARE_REG 0, eax, eax, ax, al, 4
+DECLARE_REG 1, ecx, ecx, cx, cl, 8
+DECLARE_REG 2, edx, edx, dx, dl, 12
+DECLARE_REG 3, ebx, ebx, bx, bl, 16
+DECLARE_REG 4, esi, esi, si, null, 20
+DECLARE_REG 5, edi, edi, di, null, 24
+DECLARE_REG 6, ebp, ebp, bp, null, 28
%define rsp esp
-%macro PUSH_IF_USED 1 ; reg_id
- %if %1 < regs_used
- push r%1
- %assign stack_offset stack_offset+4
- %endif
-%endmacro
-
-%macro POP_IF_USED 1 ; reg_id
- %if %1 < regs_used
- pop r%1
- %endif
+%macro DECLARE_ARG 1-*
+ %rep %0
+ %define r%1m [esp + stack_offset + 4*%1 + 4]
+ %define r%1mp dword r%1m
+ %rotate 1
+ %endrep
%endmacro
-%macro LOAD_IF_USED 2 ; reg_id, number_of_args
- %if %1 < %2
- mov r%1, [esp + stack_offset + 4 + %1*4]
- %endif
-%endmacro
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
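; e.g. r7m expands to [esp + stack_offset + 32] (4*7 + 4), matching the
; hardcoded define it replaces, and now extends cheaply through r14m.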
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
- ASSERT %2 >= %1
+ %assign num_args %1
%assign regs_used %2
- ASSERT regs_used <= 7
- PUSH_IF_USED 3
- PUSH_IF_USED 4
- PUSH_IF_USED 5
- PUSH_IF_USED 6
- LOAD_IF_USED 0, %1
- LOAD_IF_USED 1, %1
- LOAD_IF_USED 2, %1
- LOAD_IF_USED 3, %1
- LOAD_IF_USED 4, %1
- LOAD_IF_USED 5, %1
- LOAD_IF_USED 6, %1
+ %if regs_used > 7
+ %assign regs_used 7
+ %endif
+ ASSERT regs_used >= num_args
+ PUSH_IF_USED 3, 4, 5, 6
+ LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
DEFINE_ARGS %4
%endmacro
%macro RET 0
- POP_IF_USED 6
- POP_IF_USED 5
- POP_IF_USED 4
- POP_IF_USED 3
+ POP_IF_USED 6, 5, 4, 3
ret
%endmacro
%endmacro
%endif
-
-
;=============================================================================
; arch-independent part
;=============================================================================
%ifdef WIN64
; just random numbers to reduce the chance of incidental match
ALIGN 16
-n4: dq 0xa77809bf11b239d1
-n5: dq 0x2ba9bf3d2f05b389
x6: ddq 0x79445c159ce790641a1b2550a612b48c
x7: ddq 0x86b2536fcd8cf6362eed899d5a28ddcd
x8: ddq 0x3f2bf84fc0fcca4eb0856806085e7943
x13: ddq 0xdd7b8919edd427862e8ec680de14b47c
x14: ddq 0x11e53e2b2ac655ef135ce6888fa02cbf
x15: ddq 0x6de8f4c914c334d5011ff554472a7a10
+n7: dq 0x21f86d66c8ca00ce
+n8: dq 0x75b6ba21077c48ad
+n9: dq 0xed56bb2dcb3c7736
+n10: dq 0x8bda43d3fd1a7e06
+n11: dq 0xb64a9c9e5d318408
+n12: dq 0xdf9a54b303f1d3a3
+n13: dq 0x4a75479abd64e097
+n14: dq 0x249214109d5d1c88
%endif
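; The clobber-check scheme: r7-r14 (all callee-saved on WIN64: rdi, rsi, rbx,
; rbp, R12-R15) and xmm6-15 are seeded with these constants before the call,
; then xor'd/pxor'd against them afterwards and OR'd together; any nonzero
; bit means the tested function clobbered a register the ABI says it must
; preserve.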
SECTION .text
; max number of args used by any x264 asm function.
; (max_args % 4) must equal 3 for stack alignment
-%define max_args 11
+%define max_args 15
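; e.g. with 15 args: 15*8 + 8 (return address) = 128 on x86_64, and
; 15*4 + 4 = 64 on x86_32, both multiples of 16, so the nested call always
; sees a correctly aligned stack.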
%ifdef WIN64
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
INIT_XMM
-cglobal checkasm_call, 4,7,16
- sub rsp, max_args*8
- %assign stack_offset stack_offset+max_args*8
+cglobal checkasm_call, 4,15,16
+ SUB rsp, max_args*8
mov r6, r0
mov [rsp+stack_offset+16], r1
mov r0, r2
%endrep
%assign i 6
%rep 16-6
- movdqa xmm %+ i, [x %+ i]
+ mova m %+ i, [x %+ i]
+ %assign i i+1
+%endrep
+%assign i 7
+%rep 15-7
+ mov r %+ i, [n %+ i]
%assign i i+1
%endrep
- mov r4, [n4]
- mov r5, [n5]
call r6
- xor r4, [n4]
- xor r5, [n5]
- or r4, r5
- pxor xmm5, xmm5
+%assign i 7
+%rep 15-7
+ xor r %+ i, [n %+ i]
+ or r7, r %+ i
+ %assign i i+1
+%endrep
%assign i 6
%rep 16-6
- pxor xmm %+ i, [x %+ i]
- por xmm5, xmm %+ i
+ pxor m %+ i, [x %+ i]
+ por m6, m %+ i
%assign i i+1
%endrep
- packsswb xmm5, xmm5
- movq r5, xmm5
- or r4, r5
+ packsswb m6, m6
+ movq r5, m6
+ or r7, r5
jz .ok
    ; r4 maps to volatile R10 under the new register numbering, so it would
    ; no longer survive the error-reporting call; use callee-saved r9 instead
-    mov  r4, rax
+    mov  r9, rax
lea r0, [error_message]
mov dword [r1], 0
-    mov  rax, r4
+    mov  rax, r9
.ok:
- add rsp, max_args*8
- %assign stack_offset stack_offset-max_args*8
+ ADD rsp, max_args*8
RET
%elifndef ARCH_X86_64