db 4+13*8, 5+13*8, 4+14*8, 5+14*8
db 6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
-%define scan8 r11
+%define npicregs 1
+%define scan8 picregq
%else
+%define npicregs 0
%define scan8 scan8_mem
%endif
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride,
; %4=register used as a further row offset from dst ([%1+%4]; presumably 3*stride -- confirm)
%macro IDCT8_ADD_SSE 4
IDCT8_1D_FULL %2
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
paddw m0, [pw_32]
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
mova [%2 ], m0
mova [%2+16], m4
IDCT8_1D [%2], [%2+ 16]
STORE_DIFF m1, m6, m7, [%1+%3 ]
STORE_DIFF m2, m6, m7, [%1+%3*2]
STORE_DIFF m3, m6, m7, [%1+%4 ]
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
mova m0, [%2 ]
mova m1, [%2+16]
%else
; ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16_8_mmx, 5, 7, 0
+cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
; ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct8_add4_8_mmx, 5, 7, 0
+cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
; ff_h264_idct_add16_8_mmx2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16_8_mmx2, 5, 7, 0
+cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
test r6, r6
jz .no_dc
DC_ADD_MMX2_INIT r2, r3, r6
-%ifdef ARCH_X86_64
-%define dst_reg r10
-%define dst_regd r10d
-%else
-%define dst_reg r1
-%define dst_regd r1d
-%endif
- mov dst_regd, dword [r1+r5*4]
- lea dst_reg, [r0+dst_reg]
- DC_ADD_MMX2_OP movh, dst_reg, r3, r6
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
+%define dst2q r1
+%define dst2d r1d
+%endif
+ mov dst2d, dword [r1+r5*4]
+ lea dst2q, [r0+dst2q]
+ DC_ADD_MMX2_OP movh, dst2q, r3, r6
+%if ARCH_X86_64 == 0
mov r1, r1m
%endif
inc r5
; ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16intra_8_mmx, 5, 7, 0
+cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
; ff_h264_idct_add16intra_8_mmx2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
+cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
test r6, r6
jz .skipblock
DC_ADD_MMX2_INIT r2, r3, r6
-%ifdef ARCH_X86_64
-%define dst_reg r10
-%define dst_regd r10d
-%else
-%define dst_reg r1
-%define dst_regd r1d
-%endif
- mov dst_regd, dword [r1+r5*4]
- add dst_reg, r0
- DC_ADD_MMX2_OP movh, dst_reg, r3, r6
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
+%define dst2q r1
+%define dst2d r1d
+%endif
+ mov dst2d, dword [r1+r5*4]
+ add dst2q, r0
+ DC_ADD_MMX2_OP movh, dst2q, r3, r6
+%if ARCH_X86_64 == 0
mov r1, r1m
%endif
.skipblock
; ff_h264_idct8_add4_8_mmx2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
+cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
test r6, r6
jz .no_dc
DC_ADD_MMX2_INIT r2, r3, r6
-%ifdef ARCH_X86_64
-%define dst_reg r10
-%define dst_regd r10d
-%else
-%define dst_reg r1
-%define dst_regd r1d
-%endif
- mov dst_regd, dword [r1+r5*4]
- lea dst_reg, [r0+dst_reg]
- DC_ADD_MMX2_OP mova, dst_reg, r3, r6
- lea dst_reg, [dst_reg+r3*4]
- DC_ADD_MMX2_OP mova, dst_reg, r3, r6
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
+%define dst2q r1
+%define dst2d r1d
+%endif
+ mov dst2d, dword [r1+r5*4]
+ lea dst2q, [r0+dst2q]
+ DC_ADD_MMX2_OP mova, dst2q, r3, r6
+ lea dst2q, [dst2q+r3*4]
+ DC_ADD_MMX2_OP mova, dst2q, r3, r6
+%if ARCH_X86_64 == 0
mov r1, r1m
%endif
add r5, 4
INIT_XMM
; ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct8_add4_8_sse2, 5, 7, 10
+cglobal h264_idct8_add4_8_sse2, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
xor r5, r5
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
jz .no_dc
INIT_MMX
DC_ADD_MMX2_INIT r2, r3, r6
-%ifdef ARCH_X86_64
-%define dst_reg r10
-%define dst_regd r10d
-%else
-%define dst_reg r1
-%define dst_regd r1d
-%endif
- mov dst_regd, dword [r1+r5*4]
- add dst_reg, r0
- DC_ADD_MMX2_OP mova, dst_reg, r3, r6
- lea dst_reg, [dst_reg+r3*4]
- DC_ADD_MMX2_OP mova, dst_reg, r3, r6
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
+%define dst2q r1
+%define dst2d r1d
+%endif
+ mov dst2d, dword [r1+r5*4]
+ add dst2q, r0
+ DC_ADD_MMX2_OP mova, dst2q, r3, r6
+ lea dst2q, [dst2q+r3*4]
+ DC_ADD_MMX2_OP mova, dst2q, r3, r6
+%if ARCH_X86_64 == 0
mov r1, r1m
%endif
add r5, 4
REP_RET
.no_dc
INIT_XMM
- mov dst_regd, dword [r1+r5*4]
- add dst_reg, r0
- IDCT8_ADD_SSE dst_reg, r2, r3, r6
-%ifndef ARCH_X86_64
+ mov dst2d, dword [r1+r5*4]
+ add dst2q, r0
+ IDCT8_ADD_SSE dst2q, r2, r3, r6
+%if ARCH_X86_64 == 0
mov r1, r1m
%endif
.skipblock
or r6w, word [r2]
test r6, r6
jz .skipblock
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
- add r0, [r10]
+ add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
; ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add8_8_mmx, 5, 7, 0
+cglobal h264_idct_add8_8_mmx, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
mov r5, 16
add r2, 512
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
-%ifdef ARCH_X86_64
- mov r10, r0
+%if ARCH_X86_64
+ mov dst2q, r0
%endif
call h264_idct_add8_mmx_plane
mov r5, 32
add r2, 384
-%ifdef ARCH_X86_64
- add r10, gprsize
+%if ARCH_X86_64
+ add dst2q, gprsize
%else
add r0mp, gprsize
%endif
movzx r6, byte [r4+r6]
test r6, r6
jz .try_dc
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
- add r0, [r10]
+ add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
test r6, r6
jz .skipblock
DC_ADD_MMX2_INIT r2, r3, r6
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
- add r0, [r10]
+ add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
; ff_h264_idct_add8_8_mmx2(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add8_8_mmx2, 5, 7, 0
+cglobal h264_idct_add8_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
mov r5, 16
add r2, 512
-%ifdef ARCH_X86_64
- mov r10, r0
+%if ARCH_X86_64
+ mov dst2q, r0
%endif
%ifdef PIC
- lea r11, [scan8_mem]
+ lea picregq, [scan8_mem]
%endif
call h264_idct_add8_mmx2_plane
mov r5, 32
add r2, 384
-%ifdef ARCH_X86_64
- add r10, gprsize
+%if ARCH_X86_64
+ add dst2q, gprsize
%else
add r0mp, gprsize
%endif
ALIGN 16
INIT_XMM
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
-x264_add8x4_idct_sse2:
+h264_add8x4_idct_sse2:
movq m0, [r2+ 0]
movq m1, [r2+ 8]
movq m2, [r2+16]
test r0, r0
jz .cycle%1end
mov r0d, dword [r1+%1*8]
-%ifdef ARCH_X86_64
- add r0, r10
+%if ARCH_X86_64
+ add r0, r5
%else
add r0, r0m
%endif
- call x264_add8x4_idct_sse2
+ call h264_add8x4_idct_sse2
.cycle%1end
%if %1 < 7
add r2, 64
; ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16_8_sse2, 5, 5, 8
-%ifdef ARCH_X86_64
- mov r10, r0
+cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8
+%if ARCH_X86_64
+ mov r5, r0
%endif
; unrolling of the loop leads to an average performance gain of
; 20-25%
test r0, r0
jz .try%1dc
mov r0d, dword [r1+%1*8]
-%ifdef ARCH_X86_64
- add r0, r10
+%if ARCH_X86_64
+ add r0, r7
%else
add r0, r0m
%endif
- call x264_add8x4_idct_sse2
+ call h264_add8x4_idct_sse2
jmp .cycle%1end
.try%1dc
movsx r0, word [r2 ]
or r0w, word [r2+32]
jz .cycle%1end
mov r0d, dword [r1+%1*8]
-%ifdef ARCH_X86_64
- add r0, r10
+%if ARCH_X86_64
+ add r0, r7
%else
add r0, r0m
%endif
; ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
-%ifdef ARCH_X86_64
- mov r10, r0
+cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8
+%if ARCH_X86_64
+ mov r7, r0
%endif
add16intra_sse2_cycle 0, 0xc
add16intra_sse2_cycle 1, 0x14
movzx r0, word [r4+%2]
test r0, r0
jz .try%1dc
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
- add r0, [r10]
+ add r0, [r7]
%else
mov r0, r0m
mov r0, [r0]
add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
- call x264_add8x4_idct_sse2
+ call h264_add8x4_idct_sse2
jmp .cycle%1end
.try%1dc
movsx r0, word [r2 ]
or r0w, word [r2+32]
jz .cycle%1end
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
- add r0, [r10]
+ add r0, [r7]
%else
mov r0, r0m
mov r0, [r0]
; ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
-cglobal h264_idct_add8_8_sse2, 5, 7, 8
+cglobal h264_idct_add8_8_sse2, 5, 7 + ARCH_X86_64, 8
add r2, 512
-%ifdef ARCH_X86_64
- mov r10, r0
+%if ARCH_X86_64
+ mov r7, r0
%endif
add8_sse2_cycle 0, 0x34
add8_sse2_cycle 1, 0x3c
-%ifdef ARCH_X86_64
- add r10, gprsize
+%if ARCH_X86_64
+ add r7, gprsize
%else
add r0mp, gprsize
%endif
%macro IDCT_DC_DEQUANT 2
cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
+ ; manually spill XMM registers for Win64 because
+ ; the code here is initialized with INIT_MMX
+ WIN64_SPILL_XMM %2
movq m3, [r1+24]
movq m2, [r1+16]
movq m1, [r1+ 8]
WALSH4_1D 0,1,2,3,4
; shift, tmp, output, qmul
-%ifdef WIN64
+%if WIN64
DECLARE_REG_TMP 0,3,1,2
; we can't avoid this, because r0 is the shift register (ecx) on win64
xchg r0, t2
-%elifdef ARCH_X86_64
+%elif ARCH_X86_64
DECLARE_REG_TMP 3,1,0,2
%else
DECLARE_REG_TMP 1,3,0,2