Define ARCH_X86_64, HIGH_BIT_DEPTH, WIN64, and UNIX64 to numeric 0/1 values and test them with %if rather than %ifdef/%ifndef; this allows combining multiple conditionals in a single statement.
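For example, where the old code needed one nested %ifdef per macro (the pattern appears verbatim in one of the mc hunks below), a single %if expression now suffices:

    ; before: one conditional level per %ifdef
    %ifdef HIGH_BIT_DEPTH
    %if mmsize == 16
        WIN64_SPILL_XMM 8
    %endif
    %endif

    ; after: macros are defined to 0/1, so conditions combine
    %if HIGH_BIT_DEPTH && mmsize == 16
        WIN64_SPILL_XMM 8
    %endif

For this to work, every macro tested this way must always be defined to a numeric value; the Makefile, configure, and x86inc.asm hunks below define ARCH_X86_64, HIGH_BIT_DEPTH, WIN64, and UNIX64 to 0 or 1 in all configurations.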
ifeq ($(ARCH),X86)
ARCH_X86 = yes
ASMSRC = $(X86SRC) common/x86/pixel-32.asm
+ASFLAGS += -DARCH_X86_64=0
endif
ifeq ($(ARCH),X86_64)
ARCH_X86 = yes
ASMSRC = $(X86SRC:-32.asm=-64.asm) common/x86/trellis-64.asm
-ASFLAGS += -DARCH_X86_64
+ASFLAGS += -DARCH_X86_64=1
endif
ifdef ARCH_X86
cextern cabac_renorm_shift
; t3 must be ecx, since it's used for shift.
-%ifdef WIN64
+%if WIN64
DECLARE_REG_TMP 3,1,2,0,6,5,4,2
%define pointer resq
-%elifdef ARCH_X86_64
+%elif ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6,6
%define pointer resq
%else
and t4d, t6d
shr t5d, 6
movifnidn t2d, r2m
-%ifdef WIN64
+%if WIN64
PUSH r7
%endif
LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2
mov t4d, t3d
shr t3d, 3
LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
-%ifdef WIN64
+%if WIN64
POP r7
%endif
shl t4d, t3b
lea t7d, [t7*2+t3]
mov t3d, [t0+cb.queue]
inc t3d
-%ifdef UNIX64 ; .putbyte compiles to nothing but a jmp
+%if UNIX64 ; .putbyte compiles to nothing but a jmp
jge cabac_putbyte
%else
jge .putbyte
cabac_putbyte:
; alive: t0=cb t3=queue t6=low
-%ifdef WIN64
+%if WIN64
DECLARE_REG_TMP 3,6,1,0,2,5,4
%endif
mov t1d, -1
mov [rsi], edx
RET
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
;-----------------------------------------------------------------------------
; int cpu_cpuid_test( void )
SWAP %4, %9, %8
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro SUB8x8_DCT8 0
cglobal sub8x8_dct8, 3,3,8
SWAP %4, %9, %8
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro SUB8x8_DCT8 0
cglobal sub8x8_dct8, 3,3,14
-%ifdef WIN64
+%if WIN64
call .skip_prologue
RET
%endif
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2,16
add r1, 128
-%ifdef WIN64
+%if WIN64
call .skip_prologue
RET
%endif
%if cpuflag(ssse3)
mova m7, [hsub_mul]
%endif
-%ifdef WIN64
+%if WIN64
call .skip_prologue
RET
%endif
%if cpuflag(ssse3)
mova m7, [hsub_mul]
%endif
-%ifdef WIN64
+%if WIN64
call .skip_prologue
RET
%endif
cglobal add8x8_idct8, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
-%ifdef WIN64
+%if WIN64
call .skip_prologue
RET
%endif
cglobal add8x8_idct, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
-%ifdef WIN64
+%if WIN64
call .skip_prologue
RET
%endif
SWAP %1, %3
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void dct4x4dc( dctcoef d[4][4] )
;-----------------------------------------------------------------------------
RET
%endif ; HIGH_BIT_DEPTH
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void idct4x4dc( int32_t d[4][4] )
;-----------------------------------------------------------------------------
RET
%endif ; HIGH_BIT_DEPTH
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
;-----------------------------------------------------------------------------
SUB_DCT4
%endif ; HIGH_BIT_DEPTH
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 7
cglobal %1, 3,3,%7
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%if mmsize == 8
pxor m7, m7
%else
add r0, %3
add r1, %4-%5-%6*FENC_STRIDE
add r2, %4-%5-%6*FDEC_STRIDE
-%ifdef WIN64
+%if WIN64
call %2.skip_prologue
RET
%else
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
cglobal %1, 2,2,%7
%if %3==256
add r1, 128
call %2.skip_prologue
add r0, %4-%5-%6*FDEC_STRIDE
add r1, %3
-%ifdef WIN64
+%if WIN64
call %2.skip_prologue
RET
%else
%endif
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 64, 8, 0, 0, 0
SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx, 64, 16, 8, 8, 0
SUB_NxN_DCT sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14
SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 256, 16, 0, 0, 14
%else ; !HIGH_BIT_DEPTH
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX
SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx, 32, 4, 0, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx, 32, 4, 0, 0
SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11
%endif ; HIGH_BIT_DEPTH
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
;-----------------------------------------------------------------------------
cglobal add16x16_idct_dc_sse2, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
-%ifdef WIN64
+%if WIN64
call .loop
RET
%endif
cglobal add16x16_idct_dc, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
-%ifdef WIN64
+%if WIN64
call .loop
RET
%endif
psubw m0, m1 ; d02-d13 s02-s13 d02+d13 s02+s13
%endmacro
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INIT_MMX
cglobal sub8x8_dct_dc_mmx2, 3,3
DCTDC_2ROW_MMX m0, m4, 0, 0
paddw %1, m0
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro SUB8x8_DCT_DC_10 0
cglobal sub8x8_dct_dc, 3,3,3
DCTDC_4ROW_SSE2 m1, 0
RET
%endmacro
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INIT_XMM sse2
SCAN_8x8
INIT_XMM ssse3
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
SCAN_8x8_FRAME 4 , dq, qdq, dq, d
INIT_XMM avx
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
SCAN_4x4 4 , dq, qdq, dq
INIT_XMM avx
RET
%endif ; !HIGH_BIT_DEPTH
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
;-----------------------------------------------------------------------------
mova [r0+60*SIZEOF_DCTCOEF], m7
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse4
SCAN_8x8 d, dq, qdq, dq, 4
INIT_XMM avx
RET
%endmacro
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INIT_XMM ssse3
ZIGZAG_SUB_4x4 , frame
ZIGZAG_SUB_4x4 ac, frame
packsswb m5, m6
packsswb m5, m5
pxor m0, m0
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
packsswb m5, m5
%endif
pcmpeqb m5, m0
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
ZIGZAG_8x8_CAVLC D
INIT_XMM avx
%endif
%endmacro
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%macro ZIGZAG_8x8_CAVLC 0
cglobal zigzag_interleave_8x8_cavlc, 3,3,8
INTERLEAVE_XMM 0
cextern pw_00ff
cextern pw_pixel_max
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
RET
%endmacro
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
; m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
paddw t0, %3, %2
mova t2, %4
paddw t2, %3
LOAD_AB t0, t1, r2d, r3d
mova %1, t0
LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
mova %2, t0 ; mask0
psrlw t3, %1, 2
%else
%endif
%endmacro
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
RET
%endmacro
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
[base], [base+stride], [base+stride*2], [base3], \
; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
psubusb %5, %2, %1
psubusb %4, %1, %2
%else
mova %4, %2
%endmacro
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
lea r8, [r7*3]
lea r6, [r0-4]
lea r5, [r0-4+r8]
-%ifdef WIN64
+%if WIN64
sub rsp, 0x98
%define pix_tmp rsp+0x30
%else
; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
lea r0, [pix_tmp+0x30]
mov r1d, 0x10
-%ifdef WIN64
+%if WIN64
mov [rsp+0x20], r4
%endif
call deblock_v_luma
movq m3, [pix_tmp+0x40]
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
-%ifdef WIN64
+%if WIN64
add rsp, 0x98
%else
add rsp, 0x68
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
pavgb t0, p2, p1
pavgb t1, p0, q0
%else
%endif
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
mova t5, t1
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
paddb t2, p2, p1
paddb t3, p0, q0
%else
pand t2, mpb_1
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
pavgb t1, p2, q1
psubb t2, p2, q1
%else
%define t1 m5
%define t2 m6
%define t3 m7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define p2 m8
%define q2 m9
%define t4 m10
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_intra, 4,6,16
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
sub esp, 0x60
%endif
lea r4, [r1*4]
mova p0, [r4+r5]
mova q0, [r0]
mova q1, [r0+r1]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
pxor mpb_0, mpb_0
mova mpb_1, [pb_1]
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
LUMA_INTRA_SWAP_PQ
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
add esp, 0x60
%endif
RET
INIT_MMX cpuname
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
DEBLOCK_LUMA_INTRA v
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_LUMA_INTRA v8
%endif
%endif ; !HIGH_BIT_DEPTH
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
REP_RET
%endmacro ; DEBLOCK_CHROMA
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_CHROMA
%endif
DEBLOCK_CHROMA
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%macro CHROMA_V_START 0
dec r2d ; alpha-1
dec r3d ; beta-1
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_H_CHROMA_420_MBAFF
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_H_CHROMA_420_MBAFF
%endif
%macro DEBLOCK_H_CHROMA_422 0
cglobal deblock_h_chroma_422, 5,8,8
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define cntr r7
%else
%define cntr dword r0m
DEBLOCK_CHROMA_INTRA
INIT_MMX mmx2
DEBLOCK_CHROMA_INTRA_BODY
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
DEBLOCK_CHROMA_INTRA
%endif
; implicit weighted biprediction
;=============================================================================
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-%ifdef WIN64
+%if WIN64
DECLARE_REG_TMP 0,1,2,3,4,5,4,5
%macro AVG_START 0-1 0
PROLOGUE 5,7,%1
movsxd r5, dword r5m
%endmacro
-%elifdef UNIX64
+%elif UNIX64
DECLARE_REG_TMP 0,1,2,3,4,5,7,8
%macro AVG_START 0-1 0
PROLOGUE 6,9,%1
REP_RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro BIWEIGHT_MMX 2
movh m0, %1
SPLATW m3, m3 ; weight_dst,src
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro BIWEIGHT_ROW 4
BIWEIGHT [%2], [%3]
%if %4==mmsize/4
cglobal pixel_avg_weight_w%1
BIWEIGHT_START
AVG_START %2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m7, [pw_pixel_max]
%endif
.height_loop:
BIWEIGHT [t2], [t4]
SWAP 0, 6
BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
packssdw m6, m0
CLIPW m6, m5, m7
%else ;!HIGH_BIT_DEPTH
AVG_WEIGHT 4
AVG_WEIGHT 8
AVG_WEIGHT 16
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
AVG_WEIGHT 4, 8
AVG_WEIGHT 8, 8
; P frame explicit weighted prediction
;=============================================================================
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro WEIGHT_START 1 ; (width)
mova m0, [r4+ 0] ; 1<<denom
mova m3, [r4+16]
;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define NUMREGS 6
%define LOAD_HEIGHT
%define HEIGHT_REG r5d
%endif
%assign XMMREGS 7
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%assign NUMREGS NUMREGS+1
%assign XMMREGS 8
%endif
WEIGHTER 8
WEIGHTER 16
WEIGHTER 20
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
WEIGHTER 12
INIT_XMM avx
WEIGHTER 8
%macro OFFSET_OP 7
mov%6 m0, [%1]
mov%6 m1, [%2]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
p%5usw m0, m2
p%5usw m1, m2
%ifidn %5,add
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
%assign x (x+mmsize)
%else
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
%else
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
cglobal mc_offset%2_w%1, NUMREGS, NUMREGS
FIX_STRIDES r1, r3
mova m2, [r4]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%ifidn %2,add
mova m3, [pw_pixel_max]
%endif
OFFSETPN 12
OFFSETPN 16
OFFSETPN 20
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
OFFSETPN 8
INIT_XMM avx
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
%2 m0, [t2+x]
%2 m1, [t2+x+SIZEOF_PIXEL*t3]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pavgw m0, [t4+x]
pavgw m1, [t4+x+SIZEOF_PIXEL*t5]
%else ;!HIGH_BIT_DEPTH
AVG_END
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_MMX mmx2
AVG_FUNC 4, movq, movq
; pixel avg2
;=============================================================================
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_avg2_wN( uint16_t *dst, int dst_stride,
; uint16_t *src1, int src_stride,
REP_RET
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void pixel_avg2_w4( uint8_t *dst, int dst_stride,
; uint8_t *src1, int src_stride,
%endif
%if 0 ; or %1==8 - but the extra branch seems too expensive
ja cachesplit
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
test r4b, 1
%else
test byte r4m, 1
INIT_MMX
AVG_CACHELINE_CHECK 8, 64, mmx2
AVG_CACHELINE_CHECK 12, 64, mmx2
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
AVG_CACHELINE_CHECK 16, 64, mmx2
AVG_CACHELINE_CHECK 20, 64, mmx2
AVG_CACHELINE_CHECK 8, 32, mmx2
%else
lea r6, [avg_w16_addr + r6]
%endif
-%ifdef UNIX64
+%if UNIX64
jmp r6
%else
call r6
lea r5, [r3*3]
lea r4, [r1*3]
je .end
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%define mova movd
%define movu movd
%endif
;-----------------------------------------------------------------------------
%macro PREFETCH_FENC 1
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
cglobal prefetch_fenc_%1, 5,5
FIX_STRIDES r1d, r3d
and r4d, 3
; chroma MC
;=============================================================================
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
DECLARE_REG_TMP 6,7,8
%else
DECLARE_REG_TMP 0,1,2
%endif
%macro MC_CHROMA_START 1
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
PROLOGUE 0,9,%1
%else
PROLOGUE 0,6,%1
add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED 4
movu %1, [%4+0]
movu %2, [%4+4]
MC_CHROMA_START 0
FIX_STRIDES r4
and r5d, 7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
jz .mc1dy
%endif
and t2d, 7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
jz .mc1dx
%endif
shl r5d, 16
pshufw m5, m5, q1111
jge .width4
%else
-%ifdef WIN64
+%if WIN64
cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
%endif
pshufd m7, m5, q1111
pshufd m5, m5, q1111
jg .width8
%endif
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
add r2, r2
UNPACK_UNALIGNED m0, m1, m2, r3
%else
SWAP 3, 0
ALIGN 4
.loop2:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
UNPACK_UNALIGNED m0, m1, m2, r3+r4
pmullw m3, m6
%else ; !HIGH_BIT_DEPTH
pmullw m0, m5
paddw m0, m2
psrlw m0, 6
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movh [r0], m0
%if mmsize == 8
psrlq m0, 32
%if mmsize==8
.width4:
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
mov t0, r0
mov t1, r1
mov t2, r3
%endif
%else
.width8:
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define multy0 m8
SWAP 8, 5
%else
%endif
FIX_STRIDES r2
.loopx:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
UNPACK_UNALIGNED m0, m2, m4, r3
UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
%else
add r3, r4
ALIGN 4
.loop4:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
UNPACK_UNALIGNED m0, m1, m2, r3
pmaddwd m0, m7
pmaddwd m1, m7
paddw m1, m3
psrlw m0, 6
psrlw m1, 6
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movh [r0], m0
movh [r0+mmsize/2], m1
%if mmsize==8
jg .width8
REP_RET
.width8:
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
lea r3, [t2+8*SIZEOF_PIXEL]
lea r0, [t0+4*SIZEOF_PIXEL]
lea r1, [t1+4*SIZEOF_PIXEL]
jmp .loopx
%endif
-%ifdef ARCH_X86_64 ; too many regs for x86_32
+%if ARCH_X86_64 ; too many regs for x86_32
RESET_MM_PERMUTATION
-%ifdef WIN64
+%if WIN64
%if xmm_regs_used > 6
%assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
%assign xmm_regs_used 6
movd m5, r5d
mov r6d, 2*SIZEOF_PIXEL
.mc1d:
-%ifdef HIGH_BIT_DEPTH
-%if mmsize == 16
+%if HIGH_BIT_DEPTH && mmsize == 16
WIN64_SPILL_XMM 8
-%endif
%endif
mova m4, [pw_8]
SPLATW m5, m5
shr r5d, 1
%endif
.loop1d_w4:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if mmsize == 8
movq m0, [r3+0]
movq m2, [r3+8]
paddw m2, m3
psrlw m0, 3
psrlw m2, 3
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if mmsize == 8
xchg r4, r8
xchg r2, r7
pshufb m0, m5
movu m1, [r3+8]
pshufb m1, m5
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
SWAP 8, 6
%define mult1 m8
%else
REP_RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_MMX mmx2
MC_CHROMA
INIT_XMM sse2
;%define movntps movaps
;%define sfence
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
%macro HPEL_FILTER 0
cglobal hpel_filter_v, 5,6,11
FIX_STRIDES r3d, r4d
-%ifdef WIN64
+%if WIN64
movsxd r4, r4d
%endif
lea r5, [r1+r3]
HPEL_FILTER
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%macro HPEL_V 1
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,%1
-%ifdef WIN64
+%if WIN64
movsxd r4, r4d
%endif
lea r5, [r1+r3]
%ifnidn cpuname, sse2
mova m7, [pw_32]
%define tpw_32 m7
-%elifdef ARCH_X86_64
+%elif ARCH_X86_64
mova m8, [pw_32]
%define tpw_32 m8
%else
jl .loop
REP_RET
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
HPEL_V 8
INIT_XMM sse2, misalign
HPEL_C
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_XMM sse2
HPEL_C
INIT_XMM ssse3
HPEL_V 0
%endif
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%macro DO_FILT_V 5
;The optimum prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
; uint8_t *src, int stride, int width, int height)
;-----------------------------------------------------------------------------
cglobal hpel_filter, 7,9,16
-%ifdef WIN64
+%if WIN64
movsxd r4, r4d
movsxd r5, r5d
%endif
%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%assign x 0
%rep 16/mmsize
mov%4 m0, [%2+(x/2)*mmsize]
%endmacro
%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%assign n 0
%rep 16/mmsize
mova m0, [%3+(n+0)*mmsize]
; assumes i_dst and w are multiples of 16, and i_dst>2*w
cglobal plane_copy_interleave_core, 7,9
FIX_STRIDES r1d, r3d, r5d, r6d
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mov r1m, r1d
mov r3m, r3d
mov r6m, r6d
lea r0, [r0+r6*2]
add r2, r6
add r4, r6
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
DECLARE_REG_TMP 7,8
%else
DECLARE_REG_TMP 1,3
%endmacro ; PLANE_INTERLEAVE
%macro DEINTERLEAVE_START 0
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m4, [pd_ffff]
%elif cpuflag(ssse3)
mova m4, [deinterleave_shuf]
DEINTERLEAVE_START
mov r6d, r6m
FIX_STRIDES r1d, r3d, r5d, r6d
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mov r6m, r6d
%endif
movsxdifnidn r1, r1d
REP_RET
%endmacro ; PLANE_DEINTERLEAVE
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_MMX mmx2
PLANE_INTERLEAVE
INIT_MMX mmx
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
shl dword r6m, 1
FIX_STRIDES r5d
shl dword r7m, 1
%endif
-%ifdef WIN64
+%if WIN64
movsxd r5, r5d
%endif
; src += 2*(height-1)*stride + 2*width
shl r6d, 1
PUSH r6
%define src_gap [rsp]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pcmpeqw m7, m7
psrld m7, 16
.vloop:
INIT_MMX mmx2
FRAME_INIT_LOWRES
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX cache32, mmx2
FRAME_INIT_LOWRES
%endif
; SSD
;=============================================================================
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int )
;-----------------------------------------------------------------------------
SSD_ONE 16, 16
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%macro SSD_LOAD_FULL 5
mova m1, [t0+%1]
mova m2, [t2+%2]
%else
.startloop:
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3
PROLOGUE 0,0,8
%else
; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane
; distortion levels it will take much more than that though.
;-----------------------------------------------------------------------------
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro SSD_NV12 0
cglobal pixel_ssd_nv12_core, 6,7,7
shl r4d, 2
%endmacro ; SSD_NV12
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2,
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
%macro VAR_START 1
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%if %1
mova m7, [pw_00ff]
%else
%endmacro
%macro VAR_END 2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if mmsize == 8 && %1*%2 == 256
HADDUW m5, m2
%else
movd eax, m5
HADDD m6, m1
movd edx, m6
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
shl rdx, 32
add rax, rdx
%endif
%macro VAR_2ROW 2
mov r2d, %2
.loop:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+mmsize]
mova m3, [r0+%1]
%else
add r0, r1
%endif
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
punpcklbw m3, m7
punpckhbw m4, m7
%endif ; !HIGH_BIT_DEPTH
VAR_2ROW r1, 4
VAR_END 8, 8
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
FIX_STRIDES r1
VAR
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
VAR_START 1
VAR_START 0
mov r5d, %1
.loop:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+mmsize]
psubw m0, [r2]
VAR2_END %2
%endmacro
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
VAR2_8x8_MMX 8, 6
VAR2_8x8_MMX 16, 7
VAR_START 1
mov r5d, %1/2
.loop:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+r1*2]
mova m2, [r2]
VAR2_8x8_SSE2 8, 6
VAR2_8x8_SSE2 16, 7
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%macro VAR2_8x8_SSSE3 2
cglobal pixel_var2_8x%1, 5,6,8
pxor m5, m5 ; sum
%endmacro
%macro SATD_END_MMX 0
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
HADDUW m0, m1
movd eax, m0
%else ; !HIGH_BIT_DEPTH
paddw m0, m1
ret
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro SATD_MxN_MMX 3
cglobal pixel_satd_%1x%2, 4,7
SATD_START_MMX
SATD_MxN_MMX 8, 16, 8
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
cglobal pixel_satd_16x16, 4,6
SATD_START_MMX
pxor m0, m0
%endmacro
%macro BACKUP_POINTERS 0
-%ifdef ARCH_X86_64
-%ifdef WIN64
+%if ARCH_X86_64
+%if WIN64
PUSH r7
%endif
mov r6, r0
%endmacro
%macro RESTORE_AND_INC_POINTERS 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
lea r0, [r6+8]
lea r2, [r7+8]
-%ifdef WIN64
+%if WIN64
POP r7
%endif
%else
SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6
ret
-%ifdef UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
+%if UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
cglobal pixel_satd_16x4_internal
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
%endmacro ; SATDS_SSE2
%macro SA8D_INTER 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define lh m10
%define rh m0
%else
%define lh m0
%define rh [esp+48]
%endif
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
HADDUW m0, m1
paddd lh, rh
%else
%endmacro
%macro SA8D 0
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%define vertical 1
%else ; sse2 doesn't seem to like the horizontal way of doing things
%define vertical (cpuflags == cpuflags_sse2)
%endif
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
mova m7, [hmul_8p]
%endif
call pixel_sa8d_8x8_internal
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
HADDUW m0, m1
%else
HADDW m0, m1
call pixel_sa8d_8x8_internal ; pix[0]
add r2, 8*SIZEOF_PIXEL
add r0, 8*SIZEOF_PIXEL
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova m10, m0
call pixel_sa8d_8x8_internal ; pix[8*stride]
SA8D_INTER
SWAP 0, 10
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
HADDUW m0, m1
%endif
movd eax, m0
lea r4, [3*r1]
lea r5, [3*r3]
call pixel_sa8d_8x8_internal
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
HADDUW m0, m1
%else
HADDW m0, m1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
HADDUW m0, m1
%endif
mova [esp+48], m0
%endif
mova [esp+64-mmsize], m0
call pixel_sa8d_8x8_internal
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
SA8D_INTER
%else ; !HIGH_BIT_DEPTH
paddusw m0, [esp+64-mmsize]
; intra_sa8d_x3_8x8 and intra_satd_x3_4x4 are obsoleted by x9 on ssse3+,
; and are only retained for old cpus.
%macro INTRA_SA8D_SSE2 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal hadamard_load
; not really a global, but otherwise cycles get attributed to the wrong function in profiling
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m0, [r0+0*FENC_STRIDEB]
mova m1, [r0+1*FENC_STRIDEB]
mova m2, [r0+2*FENC_STRIDEB]
%macro SCALAR_HADAMARD 4-5 ; direction, offset, 3x tmp
%ifidn %1, top
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
%else
movd %3, [r1+%2*SIZEOF_PIXEL-FDEC_STRIDEB]
pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+0*FDEC_STRIDEB], 0
pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+2*FDEC_STRIDEB], 2
pinsrw %3, [r1+%2*SIZEOF_PIXEL-2+3*FDEC_STRIDEB], 3
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
psrlw %3, 8
%endif
%ifnidn %2, 0
; void intra_satd_x3_4x4( uint8_t *fenc, uint8_t *fdec, int *res )
;-----------------------------------------------------------------------------
cglobal intra_satd_x3_4x4, 3,3
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
; stack is 16 byte aligned because abi says so
%define top_1d rsp-8 ; size 8
%define left_1d rsp-16 ; size 8
movd [r2+0], m0 ; i4x4_v satd
movd [r2+4], m4 ; i4x4_h satd
movd [r2+8], m5 ; i4x4_dc satd
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
ADD esp, 16
%endif
RET
mova [sums+ 0], m7
mova [sums+ 8], m7
mova [sums+16], m7
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova [sums+24], m7
mova [sums+32], m7
mova [sums+40], m7
add r0, 4*SIZEOF_PIXEL
inc r4
jl .loop_x
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m7, [pw_1]
pmaddwd m4, m7
pmaddwd m0, m7
; horizontal sum
movifnidn r2, r2mp
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m1, m5
paddd m5, m3
HADDD m5, m7 ; DC satd
ADD rsp, stack_pad
RET
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define t0 r6
%else
%define t0 r2
movq m1, [sums+8]
movq m2, [sums+16]
movq m7, m0
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
psrlq m7, 16
HADDW m7, m3
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, paddd
%assign pad 0xc0-gprsize-(stack_offset&15)
%define pred_buf rsp
sub rsp, pad
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
INTRA_X9_PRED intrax9a, m8
%else
INTRA_X9_PRED intrax9a, [rsp+0xa0]
paddd m2, m3
paddd m4, m5
paddd m6, m7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
SWAP 7, 8
pxor m8, m8
%define %%zero m8
RET
%endif ; cpuflag
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
; int intra_satd_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
%define fenc13 m5
%define fenc46 m6
%define fenc57 m7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define tmp m8
%assign padbase 0x0
%else
ADD rsp, pad
RET
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
; int intra_sa8d_x9_8x8( uint8_t *fenc, uint8_t *fdec, uint8_t edge[36], uint16_t *bitcosts, uint16_t *satds )
;-----------------------------------------------------------------------------
; out: [tmp]=hadamard4, m0=satd
INIT_MMX mmx2
cglobal hadamard_ac_4x4
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+r1]
mova m2, [r0+r1*2]
ABSW2 m1, m3, m1, m3, m4, m5
HADAMARD 0, max, 0, 2, 4, 5
HADAMARD 0, max, 1, 3, 4, 5
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pmaddwd m0, m7
pmaddwd m1, m7
paddd m6, m0
ret
%macro AC_PREP 2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pmaddwd %1, %2
%endif
%endmacro
%macro AC_PADD 3
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
AC_PREP %2, %3
paddd %1, %2
%else
cglobal hadamard_ac_8x8
mova m6, [mask_ac4]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m7, [pw_1]
%else
pxor m7, m7
AC_PADD m5, m0, m7
sub r3, 40
mova [rsp+gprsize+8], m5 ; save satd
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pxor m6, m6
%endif
%rep 3
ABSW2 m1, m3, m1, m3, m4, m5
ABSW2 m0, m2, m0, m2, m4, m5
HADAMARD 0, max, 1, 3, 4, 5
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pand m0, [mask_ac4]
pmaddwd m1, m7
pmaddwd m0, m7
%macro HADAMARD_AC_WXH_SUM_MMX 2
mova m1, [rsp+1*mmsize]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if %1*%2 >= 128
paddd m0, [rsp+2*mmsize]
paddd m1, [rsp+3*mmsize]
movd edx, m0
movd eax, m1
shr edx, 1
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
shl rdx, 32
add rax, rdx
%endif
HADAMARD_AC_WXH_MMX 8, 8
%macro LOAD_INC_8x4W_SSE2 5
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movu m%1, [r0]
movu m%2, [r0+r1]
movu m%3, [r0+r1*2]
; in: r0=pix, r1=stride, r2=stride*3
; out: [esp+16]=sa8d, [esp+32]=satd, r0+=stride*4
cglobal hadamard_ac_8x8
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define spill0 m8
%define spill1 m9
%define spill2 m10
%define spill1 [rsp+gprsize+16]
%define spill2 [rsp+gprsize+32]
%endif
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%define vertical 1
%elif cpuflag(ssse3)
%define vertical 0
AC_PREP m2, [pw_1]
AC_PADD m2, m3, [pw_1]
AC_PADD m2, m1, [pw_1]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
paddd m2, m2
%else
paddw m2, m2
%macro HADAMARD_AC_WXH_SUM_SSE2 2
mova m1, [rsp+2*mmsize]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if %1*%2 >= 128
paddd m0, [rsp+3*mmsize]
paddd m1, [rsp+4*mmsize]
movd eax, m1
shr edx, 2 - (%1*%2 >> 8)
shr eax, 1
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
shl rdx, 32
add rax, rdx
%endif
; instantiate satds
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
cextern pixel_sa8d_8x8_internal_mmx2
INIT_MMX mmx2
SA8D
INIT_XMM sse2
SA8D
SATDS_SSE2
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INTRA_SA8D_SSE2
%endif
INIT_MMX mmx2
%define DIFFOP DIFF_SUMSUB_SSSE3
%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%define LOAD_INC_8x4W LOAD_INC_8x4W_SSSE3
%define LOAD_SUMSUB_8x4P LOAD_SUMSUB_8x4P_SSSE3
%define LOAD_SUMSUB_16P LOAD_SUMSUB_16P_SSSE3
SATDS_SSE2
SA8D
HADAMARD_AC_SSE2
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INTRA_X9
INTRA8_X9
%endif
%undef movdqa ; nehalem doesn't like movaps
%undef movdqu ; movups
%undef punpcklqdq ; or movlhps
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INIT_MMX ssse3
INTRA_X3_MMX
%endif
SATDS_SSE2
SA8D
HADAMARD_AC_SSE2
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INTRA_X9
INTRA8_X9
%endif
INIT_XMM avx
SATDS_SSE2
SA8D
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INTRA_X9
INTRA8_X9
%endif
INIT_XMM xop
SATDS_SSE2
SA8D
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INTRA_X9
; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why.
%endif
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
%macro SSIM_ITER 1
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movdqu m5, [r0+(%1&1)*r1]
movdqu m6, [r2+(%1&1)*r3]
%else
punpckhdq m5, m3, m4
punpckldq m3, m4
-%ifdef UNIX64
+%if UNIX64
%define t0 r4
%else
%define t0 rax
addps m0, m4
pshuflw m4, m0, q0032
addss m0, m4
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
movd r0m, m0
fld dword r0m
%endif
;=============================================================================
%macro ADS_START 0
-%ifdef WIN64
+%if WIN64
movsxd r5, r5d
%endif
mov r0d, r5d
punpcklqdq xmm6, xmm6
punpckhqdq xmm5, xmm5
punpckhqdq xmm4, xmm4
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
pshuflw xmm8, r6m, 0
punpcklqdq xmm8, xmm8
ADS_START
jge .end
.loopi:
mov r2, [r6+r1]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
test r2, r2
%else
mov r3, r2
TEST 1
TEST 2
TEST 3
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
shr r2, 32
%else
mov r2d, [r6+r1]
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED8x8_LOWPASS 4-5
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
paddw %2, %3
psrlw %2, 1
pavgw %1, %4, %2
movu m1, [r0-FDEC_STRIDEB]
PSLLPIX m2, m1, 1
mova m0, m1
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
PSRLPIX m1, m1, 1
pshufhw m1, m1, q2210
%else
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_4x4_DDL
INIT_XMM avx
;-----------------------------------------------------------------------------
; void predict_4x4_vr( pixel *src )
;-----------------------------------------------------------------------------
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INIT_MMX ssse3
cglobal predict_4x4_vr, 1,1
movd m1, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
;-----------------------------------------------------------------------------
%macro PREDICT_4x4 4
cglobal predict_4x4_ddr, 1,1
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movu m2, [r0-1*FDEC_STRIDEB-8]
pinsrw m2, [r0+0*FDEC_STRIDEB-2], 2
pinsrw m2, [r0+1*FDEC_STRIDEB-2], 1
; void predict_4x4_vr( pixel *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_vr, 1,1
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movu m1, [r0-1*FDEC_STRIDEB-8]
pinsrw m1, [r0+0*FDEC_STRIDEB-2], 2
pinsrw m1, [r0+1*FDEC_STRIDEB-2], 1
; void predict_4x4_hd( pixel *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_hd, 1,1
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movu m1, [r0-1*FDEC_STRIDEB-8]
PSLLPIX m1, m1, 1
pinsrw m1, [r0+0*FDEC_STRIDEB-2], 3
;-----------------------------------------------------------------------------
; void predict_4x4_ddr( pixel *src )
;-----------------------------------------------------------------------------
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_MMX mmx2
cglobal predict_4x4_ddr, 1,1
mova m0, [r0+1*FDEC_STRIDEB-8]
;-----------------------------------------------------------------------------
; void predict_4x4_hu( pixel *src )
;-----------------------------------------------------------------------------
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_4x4_hu_mmx2, 1,1
movq m0, [r0+0*FDEC_STRIDEB-8]
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_4x4_V1 w
INIT_XMM avx
; void predict_4x4_dc( pixel *src )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
cglobal predict_4x4_dc, 1,1
mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
cglobal predict_8x8_filter, 4,6,6
add r0, 0x58*SIZEOF_PIXEL
%define src r0-0x58*SIZEOF_PIXEL
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
mov r4, r1
%define t1 r4
%define t4 r1
%endif
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_FILTER w, d, q, dq
INIT_XMM ssse3
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_V
%else
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_H wd, D
%else
;-----------------------------------------------------------------------------
; void predict_8x8_dc( pixel *src, pixel *edge );
;-----------------------------------------------------------------------------
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal predict_8x8_dc, 2,2
movu m0, [r1+14]
; void predict_8x8_dc_top ( pixel *src, pixel *edge );
; void predict_8x8_dc_left( pixel *src, pixel *edge );
;-----------------------------------------------------------------------------
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro PREDICT_8x8_DC 3
cglobal %1, 2,2
%3 m0, [r1+%2]
RET
%endmacro ; PREDICT_8x8_DDLR
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_DDLR
INIT_XMM ssse3
PREDICT_8x8_DDLR
INIT_XMM ssse3, cache64
PREDICT_8x8_DDLR
-%elifndef ARCH_X86_64
+%elif ARCH_X86_64 == 0
INIT_MMX mmx2
PREDICT_8x8_DDLR
%endif
%macro PREDICT_8x8_HU 2
cglobal predict_8x8_hu, 2,2,8
add r0, 4*FDEC_STRIDEB
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if cpuflag(ssse3)
movu m5, [r1+7*SIZEOF_PIXEL]
pshufb m5, [pw_reverse]
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_HU d, wd
INIT_XMM ssse3
PREDICT_8x8_HU d, wd
INIT_XMM avx
PREDICT_8x8_HU d, wd
-%elifndef ARCH_X86_64
+%elif ARCH_X86_64 == 0
INIT_MMX mmx2
PREDICT_8x8_HU w, bw
%endif
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_VR w
INIT_XMM ssse3
PREDICT_8x8_VR w
INIT_XMM avx
PREDICT_8x8_VR w
-%elifndef ARCH_X86_64
+%elif ARCH_X86_64 == 0
INIT_MMX mmx2
PREDICT_8x8_VR b
%endif
%macro LOAD_PLANE_ARGS 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
movd mm0, r1d
movd mm2, r2d
movd mm4, r3d
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-%ifndef ARCH_X86_64
-%ifndef HIGH_BIT_DEPTH
+%if ARCH_X86_64 == 0 && HIGH_BIT_DEPTH == 0
%macro PREDICT_CHROMA_P_MMX 1
cglobal predict_8x%1c_p_core, 1,2
LOAD_PLANE_ARGS
INIT_MMX mmx2
PREDICT_CHROMA_P_MMX 8
PREDICT_CHROMA_P_MMX 16
-%endif ; !HIGH_BIT_DEPTH
-%endif ; !ARCH_X86_64
+%endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH
%macro PREDICT_CHROMA_P_XMM 1
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2,7
movd m0, r1m
movd m2, r2m
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
cglobal predict_16x16_p_core, 1,2
LOAD_PLANE_ARGS
SPLATW m2, m2, 0
pmullw m3, m1, [pw_76543210]
psllw m1, 3
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pxor m6, m6
mov r1d, 16
.loop:
INIT_XMM sse2
PREDICT_16x16_P
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INIT_XMM avx
PREDICT_16x16_P
%endif
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%macro PREDICT_8x8 0
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( uint8_t *src, uint8_t *edge )
mova [r0+3*FDEC_STRIDEB], m1
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_VL_10 w
INIT_XMM ssse3
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8_HD w, wd
INIT_XMM ssse3
PREDICT_8x8_HD
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void predict_8x8_hu( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x8C_V
%else
PREDICT_8x8C_V
%endif
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_8x8c_v_mmx, 1,1
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
PREDICT_8x16C_V
%else
;-----------------------------------------------------------------------------
; void predict_8x8c_h( uint8_t *src )
;-----------------------------------------------------------------------------
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro PREDICT_C_H 1
cglobal predict_8x%1c_h, 1,1
%macro PREDICT_8x8C_DC 0
cglobal predict_8x8c_dc, 1,3
pxor m7, m7
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movq m0, [r0-FDEC_STRIDEB+0]
movq m1, [r0-FDEC_STRIDEB+8]
HADDW m0, m2
paddw m0, m3
psrlw m0, 2
pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if cpuflag(sse2)
movq2dq xmm0, m0
punpcklwd xmm0, xmm0
INIT_MMX mmx2
PREDICT_8x8C_DC
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_MMX sse2
PREDICT_8x8C_DC
%endif
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro STORE_4LINES 3
%if cpuflag(sse2)
movdqa [r0+FDEC_STRIDEB*(%3-4)], %1
%macro PREDICT_8x16C_DC 0
cglobal predict_8x16c_dc, 1,3
pxor m7, m7
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movq m0, [r0-FDEC_STRIDEB+0]
movq m1, [r0-FDEC_STRIDEB+8]
HADDW m0, m2
psrlw m1, 2
pavgw m0, m7
pavgw m1, m7
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if cpuflag(sse2)
movq2dq xmm0, m0
movq2dq xmm1, m1
INIT_MMX mmx2
PREDICT_8x16C_DC
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_MMX sse2
PREDICT_8x16C_DC
%endif
%macro PREDICT_C_DC_TOP 1
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM
cglobal predict_8x%1c_dc_top_sse2, 1,1
pxor m2, m2
;-----------------------------------------------------------------------------
; void predict_16x16_v( pixel *src )
;-----------------------------------------------------------------------------
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_16x16_v_mmx2, 1,2
mova m0, [r0 - FDEC_STRIDEB+ 0]
%macro PREDICT_16x16_H 0
cglobal predict_16x16_h, 1,2
mov r1, 12*FDEC_STRIDEB
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
.vloop:
%assign Y 0
%rep 4
INIT_MMX mmx2
PREDICT_16x16_H
INIT_XMM sse2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
PREDICT_16x16_H
%else
;no SSE2 for 8-bit, it's slower than MMX on all systems that don't support SSSE3
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m0, [r0 - FDEC_STRIDEB+ 0]
paddw m0, [r0 - FDEC_STRIDEB+ 8]
paddw m0, [r0 - FDEC_STRIDEB+16]
INIT_MMX mmx2
cglobal predict_16x16_dc_core, 1,2
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
movd m6, r1d
PRED16x16_DC m6, 5
%else
REP_RET
INIT_MMX mmx2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,2
movd m0, r1m
SPLATW m0, m0
;-----------------------------------------------------------------------------
%macro PRED16x16_DC_SSE2 2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m0, [r0 - FDEC_STRIDEB+ 0]
paddw m0, [r0 - FDEC_STRIDEB+16]
HADDW m0, m2
REP_RET
INIT_XMM sse2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,2
movd m0, r1m
SPLATW m0, m0
%macro QUANT_DC_START 0
movd m6, r1m ; mf
movd m7, r2m ; bias
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
SPLATD m6, m6
SPLATD m7, m7
%elif cpuflag(sse4) ; ssse3, but not faster on conroe
setne al
%else ; !sse4
xor eax, eax
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%if mmsize == 16
packsswb m5, m5
%endif
%endif ; cpuflag
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro QUANT_ONE_DC 4
%if cpuflag(sse4)
mova m0, [%1]
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%macro QUANT_ONE 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
INIT_MMX mmx2
QUANT_DC quant_2x2_dc, 1
-%ifndef ARCH_X86_64 ; not needed because sse2 is faster
+%if ARCH_X86_64 == 0 ; not needed because sse2 is faster
QUANT_DC quant_4x4_dc, 4
INIT_MMX mmx
QUANT_AC quant_4x4, 4
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; m2 i_qbits
mova m0, %2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pmaddwd m0, %1
pslld m0, m2
%else
;;; m3 f
;;; m4 0
mova m0, %1
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pmadcswd m0, m0, %2, m3
psrad m0, m2
%else
%endrep
%endmacro
-%ifdef WIN64
+%if WIN64
DECLARE_REG_TMP 6,3,2
-%elifdef ARCH_X86_64
+%elif ARCH_X86_64
DECLARE_REG_TMP 4,3,2
%else
DECLARE_REG_TMP 2,0,1
sub t2d, t1d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %1
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
add r1, t2 ; dequant_mf[i_mf]
%else
add r1, r1mp ; dequant_mf[i_mf]
psrld m3, 1
DEQUANT_LOOP DEQUANT32_R, %1*%1/4, %3
-%ifndef HIGH_BIT_DEPTH
-%if notcpuflag(avx)
+%if HIGH_BIT_DEPTH == 0 && notcpuflag(avx)
cglobal dequant_%1x%1_flat16, 0,3
movifnidn t2d, r2m
%if %1 == 8
DEQUANT16_FLAT [r1+32], 32, 96
%endif
RET
-%endif ; !AVX
-%endif ; !HIGH_BIT_DEPTH
+%endif ; !HIGH_BIT_DEPTH && !AVX
%endmacro ; DEQUANT
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
%else
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx
DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
psrld m4, 1
movd m2, [r1]
%assign x 0
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pshufd m2, m2, 0
%rep SIZEOF_PIXEL*32/mmsize
mova m0, [r0+x]
RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT_DC d, pmaddwd
INIT_XMM xop
DEQUANT_DC d, pmaddwd
%else
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEQUANT_DC w, pmullw
%endif
%endif
; t4 is eax for return value.
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,6,4 ; Identical for both Windows and *NIX
%else
DECLARE_REG_TMP 4,1,2,3,0,5
%if cpuflag(sse4)
%assign %%regs %%regs-1
%endif
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
%assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
%endif
cglobal optimize_chroma_2x2_dc, 0,%%regs,7
REP_RET
%endmacro
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
INIT_XMM sse2
OPTIMIZE_CHROMA_2x2_DC
INIT_XMM ssse3
OPTIMIZE_CHROMA_2x2_DC
%endif ; !HIGH_BIT_DEPTH
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
;-----------------------------------------------------------------------------
REP_RET
%endmacro
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx
DENOISE_DCT
%endif
REP_RET
%endmacro
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx
DENOISE_DCT
%endif
%macro DECIMATE_MASK 5
%if mmsize==16
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movdqa xmm0, [%3+ 0]
movdqa xmm1, [%3+32]
packssdw xmm0, [%3+16]
pmovmskb %2, xmm0
%else ; mmsize==8
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movq mm0, [%3+ 0]
movq mm1, [%3+16]
movq mm2, [%3+32]
%endmacro
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DECIMATE4x4 15
DECIMATE4x4 16
%macro DECIMATE8x8 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
cglobal decimate_score64, 1,5
%ifdef PIC
lea r4, [decimate_table8]
%endmacro
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DECIMATE8x8
%endif
%endif
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro LAST_MASK 3-4
%if %1 == 4
movq mm0, [%3]
RET
%endmacro
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
COEFF_LAST8
%endif
%endmacro
%macro COEFF_LAST48 0
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
cglobal coeff_last4, 1,1
BSR rax, [r0], 0x3f
shr eax, 4
BSR eax, r1d, 0x1f
RET
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
cglobal coeff_last64, 1, 5-mmsize/16
pxor m2, m2
LAST_MASK 16, r2d, r0+SIZEOF_DCTCOEF* 32, r4d
%endif
%endmacro
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
COEFF_LAST
%endif
;-----------------------------------------------------------------------------
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
-%ifdef WIN64
+%if WIN64
DECLARE_REG_TMP 3,1,2,0,4,5,6
-%elifdef ARCH_X86_64
+%elif ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6
%else
DECLARE_REG_TMP 6,3,2,1,4,5,0
mov [t1], t4d
.loop:
LZCOUNT t3d, t5d, 0x1f
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mov t2d, [t0+t4*4]
mov [t1+t6+8+16*4], t3b
mov [t1+t6*4+ 8], t2d
%endmacro
INIT_MMX mmx2
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
%endif
COEFF_LEVELRUN 4
COEFF_LEVELRUN 8
INIT_XMM sse2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_XMM sse2, lzcnt
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
; void pixel_vsad( pixel *src, int stride );
;-----------------------------------------------------------------------------
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX
cglobal pixel_vsad_mmx2, 3,3
mova m0, [r0]
%endmacro
%macro SAD_X3_END 0
-%ifdef UNIX64
+%if UNIX64
movd [r5+0], mm0
movd [r5+4], mm1
movd [r5+8], mm2
;-----------------------------------------------------------------------------
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3_mmx2, %1+2, %1+2
-%ifdef WIN64
+%if WIN64
%assign i %1+1
movsxd r %+ i, r %+ i %+ d
%endif
movq xmm7, [r0]
movq xmm4, [r1]
movq xmm5, [r2]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
movq xmm6, [r3]
movq xmm8, [r4]
movhps xmm7, [r0+FENC_STRIDE]
movu xmm4, [r1+%2]
movu xmm5, [r2+%2]
movu xmm6, [r3+%2]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
movu xmm8, [r4+%2]
psadbw xmm4, xmm7
psadbw xmm5, xmm7
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
-%ifdef UNIX64
+%if UNIX64
movd [r5+0], xmm0
movd [r5+4], xmm1
movd [r5+8], xmm2
;-----------------------------------------------------------------------------
%macro SAD_X_SSE2 3
cglobal pixel_sad_x%1_%2x%3, 2+%1,2+%1,9
-%ifdef WIN64
+%if WIN64
%assign i %1+1
movsxd r %+ i, r %+ i %+ d
%endif
CHECK_SPLIT r3m, %1, %3
jmp pixel_sad_x3_%1x%2_%4
.split:
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
PROLOGUE 6,9
-%ifdef WIN64
+%if WIN64
movsxd r4, r4d
sub rsp, 8
%endif
mov r8, r5
call pixel_sad_%1x%2_cache%3_%5
mov [r8], eax
-%ifdef WIN64
+%if WIN64
mov r2, [rsp]
%else
pop r2
mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
mov [r8+4], eax
-%ifdef WIN64
+%if WIN64
mov r2, [rsp+8]
%else
pop r2
mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
mov [r8+8], eax
-%ifdef WIN64
+%if WIN64
add rsp, 24
%endif
RET
CHECK_SPLIT r4m, %1, %3
jmp pixel_sad_x4_%1x%2_%4
.split:
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
PROLOGUE 6,9
mov r8, r6mp
-%ifdef WIN64
+%if WIN64
movsxd r5, r5d
%endif
push r4
mov r7, r0
call pixel_sad_%1x%2_cache%3_%5
mov [r8], eax
-%ifdef WIN64
+%if WIN64
mov r2, [rsp]
%else
pop r2
mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
mov [r8+4], eax
-%ifdef WIN64
+%if WIN64
mov r2, [rsp+8]
%else
pop r2
mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
mov [r8+8], eax
-%ifdef WIN64
+%if WIN64
mov r2, [rsp+16]
%else
pop r2
mov r0, r7
call pixel_sad_%1x%2_cache%3_%5
mov [r8+12], eax
-%ifdef WIN64
+%if WIN64
add rsp, 24
%endif
RET
; instantiate the aligned sads
INIT_MMX
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
SAD16_CACHELINE_FUNC_MMX2 8, 32
SAD16_CACHELINE_FUNC_MMX2 16, 32
SAD8_CACHELINE_FUNC_MMX2 4, 32
SAD8_CACHELINE_FUNC_MMX2 8, 64
SAD8_CACHELINE_FUNC_MMX2 16, 64
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
SADX34_CACHELINE_FUNC 16, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 16, 8, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 8, 16, 32, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 8, 16, 64, mmx2, mmx2, mmx2
SADX34_CACHELINE_FUNC 8, 8, 64, mmx2, mmx2, mmx2
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
SAD16_CACHELINE_FUNC sse2, 8
SAD16_CACHELINE_FUNC sse2, 16
%assign i 1
HADDW m1, m4
HADDW m2, m5
%endif
-%ifdef UNIX64
+%if UNIX64
movd [r5+0], m0
movd [r5+4], m1
movd [r5+8], m2
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
%assign regnum %1+1
%xdefine STRIDE r %+ regnum
-%ifdef WIN64
+%if WIN64
movsxd STRIDE, STRIDE %+ d
%endif
mov r6, %3/2-1
%if cpuflag(ssse3)
pabsd m%1, m%1
pmuludq m%1, m%1
-%elifdef HIGH_BIT_DEPTH
+%elif HIGH_BIT_DEPTH
ABSD m%2, m%1
SWAP %1, %2
pmuludq m%1, m%1
%assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15)
SUB rsp, pad
DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last
-%ifdef WIN64
+%if WIN64
%define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array
%else
%define level_statem rsp+stack_offset+32
%define zigzagm [stack+8]
mov last_nnzm, iid
mov zigzagm, zigzagq
-%ifndef WIN64
+%if WIN64 == 0
%define orig_coefsm [stack+16]
%define quant_coefsm [stack+24]
mov orig_coefsm, orig_coefsq
movzx r0, word [level_tree + r0*4]
psrld m0, 16
movd m1, [dctq + r2*SIZEOF_DCTCOEF]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
psignd m0, m1
movd [dctq + r2*SIZEOF_DCTCOEF], m0
%else
%endif
%else
mov r5d, [level_tree + r0*4]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mov r4d, dword [dctq + r2*SIZEOF_DCTCOEF]
%else
movsx r4d, word [dctq + r2*SIZEOF_DCTCOEF]
shr r5d, 16
xor r5d, r4d
sub r5d, r4d
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mov [dctq + r2*SIZEOF_DCTCOEF], r5d
%else
mov [dctq + r2*SIZEOF_DCTCOEF], r5w
pxor m0, m0
mova [r10+ 0], m0
mova [r10+16], m0
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova [r10+32], m0
mova [r10+48], m0
%endif
.i_loop%1:
; if( !quant_coefs[i] )
mov r6, quant_coefsm
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mov abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF]
%else
movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF]
movzx zigzagid, byte [zigzagq+iiq]
movd m0, abs_leveld
mov r6, orig_coefsm
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
%else
movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis;
; ssd1[k] -= psy_weight * psy_value;
mov r6, fenc_dctm
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
%else
movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
%define program_name x264
-%ifdef ARCH_X86_64
+%define WIN64 0
+%define UNIX64 0
+%if ARCH_X86_64
%ifidn __OUTPUT_FORMAT__,win32
- %define WIN64
+ %define WIN64 1
%else
- %define UNIX64
+ %define UNIX64 1
%endif
%endif
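Note that once WIN64 and UNIX64 are unconditionally defined (to 0 by default, overridden to 1 above), a leftover %ifdef WIN64 would always evaluate true, so every remaining test must use the numeric %if forms. The payoff is that the three calling conventions collapse into one %if/%elif chain, as in the DECLARE_REG_TMP hunks above:

    %if WIN64
        DECLARE_REG_TMP 3,1,2,0,4,5,6
    %elif ARCH_X86_64
        DECLARE_REG_TMP 0,1,2,3,4,5,6
    %else
        DECLARE_REG_TMP 6,3,2,1,4,5,0
    %endif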
%endif
%endmacro
-%ifdef WIN64
+%if WIN64
%define PIC
-%elifndef ARCH_X86_64
+%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
%if %0 == 5
%define r%1m %3
%define r%1mp %2
- %elifdef ARCH_X86_64 ; memory
+ %elif ARCH_X86_64 ; memory
%define r%1m [rsp + stack_offset + %6]
%define r%1mp qword r %+ %1m
%else
%define e%1w %1
%define r%1b %2
%define e%1b %2
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
%define r%1 e%1
%endif
%endmacro
DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define gprsize 8
%else
%define gprsize 4
%assign n_arg_names %0
%endmacro
-%ifdef WIN64 ; Windows x64 ;=================================================
+%if WIN64 ; Windows x64 ;=================================================
DECLARE_REG 0, rcx, ecx, cx, cl
DECLARE_REG 1, rdx, edx, dx, dl
%endif
%endmacro
-%elifdef ARCH_X86_64 ; *nix x64 ;=============================================
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
DECLARE_REG 0, rdi, edi, di, dil
DECLARE_REG 1, rsi, esi, si, sil
%endif ;======================================================================
-%ifndef WIN64
+%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%define RESET_MM_PERMUTATION INIT_XMM %1
%define mmsize 16
%define num_mmregs 8
- %ifdef ARCH_X86_64
+ %if ARCH_X86_64
%define num_mmregs 16
%endif
%define mova movdqa
%define RESET_MM_PERMUTATION INIT_YMM %1
%define mmsize 32
%define num_mmregs 8
- %ifdef ARCH_X86_64
+ %if ARCH_X86_64
%define num_mmregs 16
%endif
%define mova vmovaps
%assign SIZEOF_PIXEL 1
%assign SIZEOF_DCTCOEF 2
%define pixel byte
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%assign SIZEOF_PIXEL 2
%assign SIZEOF_DCTCOEF 4
%define pixel word
%assign PIXEL_MAX ((1 << BIT_DEPTH)-1)
%macro FIX_STRIDES 1-*
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%rep %0
add %1, %1
%rotate 1
%endmacro
%macro TRANSPOSE8x8W 9-11
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
SBUTTERFLY wd, %1, %2, %9
SBUTTERFLY wd, %3, %4, %9
SBUTTERFLY wd, %5, %6, %9
%macro LOAD_DIFF 5
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova %1, %4
psubw %1, %5
%elifidn %3, none
; (high depth) in: %1, %2, min to clip, max to clip, mem128
; in: %1, tmp, %3, mem64
%macro STORE_DIFF 4-5
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
psrad %1, 6
psrad %2, 6
packssdw %1, %2
if [ "$bit_depth" -gt "8" ]; then
define HIGH_BIT_DEPTH
- ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH"
+ ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1"
+else
+ ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=0"
fi
if [ "$chroma_format" != "all" ]; then
error_message: db "failed to preserve register", 0
-%ifdef WIN64
+%if WIN64
; just random numbers to reduce the chance of incidental match
ALIGN 16
x6: ddq 0x79445c159ce790641a1b2550a612b48c
; (max_args % 4) must equal 3 for stack alignment
%define max_args 15
-%ifdef WIN64
+%if WIN64
;-----------------------------------------------------------------------------
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
ADD rsp, max_args*8
RET
-%elifndef ARCH_X86_64
+%elif ARCH_X86_64 == 0
; just random numbers to reduce the chance of incidental match
%define n3 dword 0x6549315c