%macro LOAD_GLOBAL 4
%ifdef PIC
; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
- lea r11, [%2 GLOBAL]
+ lea r11, [%2]
%ifnidn %3, 0
add r11, %3
%endif
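; RIP-relative addressing on x86_64 encodes only [rip+disp32], so a PIC symbol
; reference cannot take an extra index register. The address is therefore
; materialized into r11 with a RIP-relative lea and the runtime offset in %3 is
; added separately, instead of being folded into a single memory operand.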
global x264_sub8x8_dct_%1.skip_prologue
.skip_prologue:
%ifnidn %1, sse2
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
%endif
LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
SPILL r0, 1,2
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
UNSPILL r0, 0
%else
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
SPILL r0, 0,1
SWAP 1, 7
SPILL r1, 0
TRANSPOSE2x4x4W 4,5,6,7,0
UNSPILL r1, 0
- paddw m0, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
IDCT4_1D 0,1,2,3,r1
- paddw m4, [pw_32 GLOBAL]
+ paddw m4, [pw_32]
IDCT4_1D 4,5,6,7,r1
SPILL r1, 6,7
pxor m7, m7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
SPILL r1, 6
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
- paddw m0, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
SPILL r1, 0
IDCT8_1D 0,1,2,3,4,5,6,7,r1
SPILL r1, 6,7
cglobal x264_sub8x8_dct_%1, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
%endif
%ifdef WIN64
call .skip_prologue
cglobal x264_sub8x8_dct8_%1, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
%endif
%ifdef WIN64
call .skip_prologue
movdqa m7, [r1+0x70]
IDCT8_1D 0,1,2,3,4,5,6,7,8,10
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
- paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end
+ paddw m0, [pw_32] ; rounding for the >>6 at the end
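; (adding 32 before the final arithmetic shift right by 6 gives round-to-nearest: (x+32)>>6)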
IDCT8_1D 0,1,2,3,4,5,6,7,8,10
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
TRANSPOSE2x4x4W 0,1,2,3,8
IDCT4_1D 4,5,6,7,8,10
TRANSPOSE2x4x4W 4,5,6,7,8
- paddw m0, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
IDCT4_1D 0,1,2,3,8,10
- paddw m4, [pw_32 GLOBAL]
+ paddw m4, [pw_32]
IDCT4_1D 4,5,6,7,8,10
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
movq m2, [r0+16]
movq m1, [r0+ 8]
movq m0, [r0+ 0]
- movq m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works
+ movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
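; pavgw is an unsigned rounded average, (a+b+1)>>1, so the signed coefficients
; are biased into unsigned range with pw_8000 before averaging and biased back
; afterwards (hence "convert to unsigned and back").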
WALSH4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
SUMSUB_BADC m1, m0, m3, m2, m4
LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
%else
- mova m5, [hsub_mul GLOBAL]
+ mova m5, [hsub_mul]
LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
%endif
DCT4_1D 0,1,2,3,4
movq m0, [r1+ 0]
IDCT4_1D 0,1,2,3,4,5
TRANSPOSE4x4W 0,1,2,3,4
- paddw m0, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
IDCT4_1D 0,1,2,3,4,5
STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
punpckhdq m2, m0
SWAP 0, 1
- mova m1, [pw_32_0 GLOBAL]
+ mova m1, [pw_32_0]
paddw m1, m0 ; row1/row0 corrected
psraw m0, 1 ; row1>>1/...
mova m3, m2 ; row3/row2
pxor m7, m7
%else
add r2, 4*FDEC_STRIDE
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
%endif
.skip_prologue:
%ifdef WIN64
movq mm0, [r1]
pxor mm1, mm1
add r0, FDEC_STRIDE*4
- paddw mm0, [pw_32 GLOBAL]
+ paddw mm0, [pw_32]
psraw mm0, 6
psubw mm1, mm0
packuswb mm0, mm0
movq xmm0, [r1]
pxor xmm1, xmm1
add r0, FDEC_STRIDE*4
- paddw xmm0, [pw_32 GLOBAL]
+ paddw xmm0, [pw_32]
psraw xmm0, 6
psubw xmm1, xmm0
- movdqa xmm5, [pb_idctdc_unpack GLOBAL]
+ movdqa xmm5, [pb_idctdc_unpack]
packuswb xmm0, xmm0
packuswb xmm1, xmm1
pshufb xmm0, xmm5
.loop:
movq mm0, [r1]
pxor mm1, mm1
- paddw mm0, [pw_32 GLOBAL]
+ paddw mm0, [pw_32]
psraw mm0, 6
psubw mm1, mm0
packuswb mm0, mm0
punpcklwd xmm2, xmm2
pxor xmm1, xmm1
pxor xmm3, xmm3
- paddw xmm0, [pw_32 GLOBAL]
- paddw xmm2, [pw_32 GLOBAL]
+ paddw xmm0, [pw_32]
+ paddw xmm2, [pw_32]
psraw xmm0, 6
psraw xmm2, 6
psubw xmm1, xmm0
movdqa xmm0, [r1]
add r1, 16
pxor xmm1, xmm1
- paddw xmm0, [pw_32 GLOBAL]
+ paddw xmm0, [pw_32]
psraw xmm0, 6
psubw xmm1, xmm0
- movdqa xmm5, [ pb_idctdc_unpack GLOBAL]
- movdqa xmm6, [pb_idctdc_unpack2 GLOBAL]
+ movdqa xmm5, [ pb_idctdc_unpack]
+ movdqa xmm6, [pb_idctdc_unpack2]
packuswb xmm0, xmm0
packuswb xmm1, xmm1
movdqa xmm2, xmm0
cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
movdqa xmm1, [r1+16]
movdqa xmm0, [r1]
- pshufb xmm1, [pb_scan4frameb GLOBAL]
- pshufb xmm0, [pb_scan4framea GLOBAL]
+ pshufb xmm1, [pb_scan4frameb]
+ pshufb xmm0, [pb_scan4framea]
movdqa xmm2, xmm1
psrldq xmm1, 6
palignr xmm2, xmm0, 6
punpcklqdq xmm0, xmm2
punpcklqdq xmm4, xmm6
%ifidn %2, frame
- movdqa xmm7, [pb_sub4frame GLOBAL]
+ movdqa xmm7, [pb_sub4frame]
%else
- movdqa xmm7, [pb_sub4field GLOBAL]
+ movdqa xmm7, [pb_sub4field]
%endif
pshufb xmm0, xmm7
pshufb xmm4, xmm7
psubw xmm1, xmm5
%ifidn %1, ac
movd r2d, xmm0
- pand xmm0, [pb_subacmask GLOBAL]
+ pand xmm0, [pb_subacmask]
%endif
movdqa [r0], xmm0
pxor xmm2, xmm2
packsswb m5, m5
pxor m0, m0
pcmpeqb m5, m0
- paddb m5, [pb_1 GLOBAL]
+ paddb m5, [pb_1]
movd r0d, m5
mov [r2+0], r0w
shr r0d, 16
packsswb m2, m2
packsswb m2, m2
pcmpeqb m5, m2
- paddb m5, [pb_1 GLOBAL]
+ paddb m5, [pb_1]
movd r0d, m5
mov [r2+0], r0w
shr r0d, 16
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
mova m5, m1
- pxor m5, m2 ; p0^q0
- pand m5, [pb_01 GLOBAL] ; (p0^q0)&1
+ pxor m5, m2 ; p0^q0
+ pand m5, [pb_01] ; (p0^q0)&1
pcmpeqb m4, m4
pxor m3, m4
- pavgb m3, m0 ; (p1 - q1 + 256)>>1
- pavgb m3, [pb_03 GLOBAL] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
+ pavgb m3, m0 ; (p1 - q1 + 256)>>1
+ pavgb m3, [pb_03] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
pxor m4, m1
- pavgb m4, m2 ; (q0 - p0 + 256)>>1
+ pavgb m4, m2 ; (q0 - p0 + 256)>>1
pavgb m3, m5
- paddusb m3, m4 ; d+128+33
- mova m6, [pb_a1 GLOBAL]
+ paddusb m3, m4 ; d+128+33
+ mova m6, [pb_a1]
psubusb m6, m3
- psubusb m3, [pb_a1 GLOBAL]
+ psubusb m3, [pb_a1]
pminub m6, m7
pminub m3, m7
psubusb m1, m6
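; pavgb computes the unsigned rounded average (a+b+1)>>1; the steps above use
; it to evaluate the p0/q0 delta entirely in unsigned 8-bit lanes, as the
; per-instruction comments derive.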
%macro LUMA_Q1 6
mova %6, m1
pavgb %6, m2
- pavgb %2, %6 ; avg(p2,avg(p0,q0))
+ pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
- pand %6, [pb_01 GLOBAL] ; (p2^avg(p0,q0))&1
- psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
+ pand %6, [pb_01] ; (p2^avg(p0,q0))&1
+ psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
mova %6, %1
psubusb %6, %5
paddusb %5, %1
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
- %define mpb_00 [pb_00 GLOBAL]
- %define mpb_01 [pb_01 GLOBAL]
+ %define mpb_00 [pb_00]
+ %define mpb_01 [pb_01]
%endif
;-----------------------------------------------------------------------------
mova q1, [r0+r1]
%ifdef ARCH_X86_64
pxor mpb_00, mpb_00
- mova mpb_01, [pb_01 GLOBAL]
+ mova mpb_01, [pb_01]
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
SWAP 7, 12 ; m12=mask0
pavgb t5, mpb_00
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
mova m4, t5
mova mask0, m7
- pavgb m4, [pb_00 GLOBAL]
- pavgb m4, [pb_01 GLOBAL] ; alpha/4+1
+ pavgb m4, [pb_00]
+ pavgb m4, [pb_01] ; alpha/4+1
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
pand m6, mask0
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
%macro CHROMA_INTRA_P0 3
movq m4, %1
pxor m4, %3
- pand m4, [pb_01 GLOBAL] ; m4 = (p0^q1)&1
+ pand m4, [pb_01] ; m4 = (p0^q1)&1
pavgb %1, %3
psubusb %1, m4
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%macro BIWEIGHT_START_MMX 0
movd m2, r6m
SPLATW m2, m2 ; weight_dst
- mova m3, [pw_64 GLOBAL]
+ mova m3, [pw_64]
psubw m3, m2 ; weight_src
- mova m4, [pw_32 GLOBAL] ; rounding
+ mova m4, [pw_32] ; rounding
pxor m5, m5
%endmacro
shl t7d, 8
add t6d, t7d
movd m3, t6d
- mova m4, [pw_32 GLOBAL]
+ mova m4, [pw_32]
SPLATW m3, m3 ; weight_dst,src
%endmacro
%macro INIT_SHIFT 2
and eax, 7
shl eax, 3
- movd %1, [sw_64 GLOBAL]
+ movd %1, [sw_64]
movd %2, eax
psubw %1, %2
%endmacro
shl r6, 4 ;jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
%ifdef PIC
- lea r11, [avg_w16_addr GLOBAL]
+ lea r11, [avg_w16_addr]
add r6, r11
%else
- lea r6, [avg_w16_addr + r6 GLOBAL]
+ lea r6, [avg_w16_addr + r6]
%endif
%ifdef UNIX64
jmp r6
SPLATW m5, m5 ; m5 = dx
SPLATW m6, m6 ; m6 = dy
- mova m4, [pw_8 GLOBAL]
+ mova m4, [pw_8]
mova m0, m4
psubw m4, m5 ; m4 = 8-dx
psubw m0, m6 ; m0 = 8-dy
punpcklbw m2, m3
punpcklbw m1, m3
- paddw m0, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
pmullw m2, m5 ; line * cB
pmullw m1, m7 ; line * cD
movd m6, r4d
mov r5d, 1
.mc1d:
- mova m5, [pw_8 GLOBAL]
+ mova m5, [pw_8]
SPLATW m6, m6
- mova m7, [pw_4 GLOBAL]
+ mova m7, [pw_4]
psubw m5, m6
movifnidn r0, r0mp
movifnidn r1d, r1m
imul r4d, t0d ; (x*255+8)*(8-y)
cmp dword r6m, 4
jg .width8
- mova m5, [pw_32 GLOBAL]
+ mova m5, [pw_32]
movd m6, r5d
movd m7, r4d
movifnidn r0, r0mp
and r2, ~3
and r5, 3
%ifdef PIC
- lea r11, [ch_shuffle GLOBAL]
+ lea r11, [ch_shuffle]
movu m5, [r11 + r5*2]
%else
- movu m5, [ch_shuffle + r5*2 GLOBAL]
+ movu m5, [ch_shuffle + r5*2]
%endif
movu m0, [r2]
pshufb m0, m5
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
- paddw m0, [pw_32 GLOBAL]
- paddw m2, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
+ paddw m2, [pw_32]
paddw m1, m0
paddw m3, m2
mova m0, m4
cmp r5, 0x38
jge .split
%endif
- mova m5, [pw_32 GLOBAL]
+ mova m5, [pw_32]
movh m0, [r2]
movh m1, [r2+1]
punpcklbw m0, m1
and r2, ~7
and r5, 7
%ifdef PIC
- lea r11, [ch_shuffle GLOBAL]
+ lea r11, [ch_shuffle]
movu m5, [r11 + r5*2]
%else
- movu m5, [ch_shuffle + r5*2 GLOBAL]
+ movu m5, [ch_shuffle + r5*2]
%endif
movu m0, [r2]
pshufb m0, m5
%ifdef ARCH_X86_64
- mova m8, [pw_32 GLOBAL]
+ mova m8, [pw_32]
%define round m8
%else
- %define round [pw_32 GLOBAL]
+ %define round [pw_32]
%endif
.splitloop8:
movu m1, [r2+r3]
%ifnidn %1, ssse3
pxor m0, m0
%else
- mova m0, [filt_mul51 GLOBAL]
+ mova m0, [filt_mul51]
%endif
.loop:
%ifidn %1, ssse3
pmaddubsw m4, m0
pmaddubsw m2, m0
pmaddubsw m5, m0
- pmaddubsw m3, [filt_mul20 GLOBAL]
- pmaddubsw m6, [filt_mul20 GLOBAL]
+ pmaddubsw m3, [filt_mul20]
+ pmaddubsw m6, [filt_mul20]
paddw m1, m2
paddw m4, m5
paddw m1, m3
LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
FILT_V2
%endif
- mova m7, [pw_16 GLOBAL]
+ mova m7, [pw_16]
mova [r2+r4*2], m1
mova [r2+r4*2+mmsize], m4
paddw m1, m7
lea r1, [r1+r2*2]
neg r2
%define src r1+r2*2
- movq m7, [pw_32 GLOBAL]
+ movq m7, [pw_32]
.loop:
movq m1, [src-4]
movq m2, [src-2]
punpcklbw m7, m0
punpcklbw m6, m0
paddw m6, m7 ; a1
- movq m7, [pw_1 GLOBAL]
+ movq m7, [pw_1]
FILT_H2 m1, m2, m3, m4, m5, m6
FILT_PACK m1, m4, 1
movntq [r0+r2], m1
neg r2
%define src r1+r2*2
%ifidn %1, ssse3
- mova m7, [pw_32 GLOBAL]
+ mova m7, [pw_32]
%define tpw_32 m7
%elifdef ARCH_X86_64
- mova m8, [pw_32 GLOBAL]
+ mova m8, [pw_32]
%define tpw_32 m8
%else
- %define tpw_32 [pw_32 GLOBAL]
+ %define tpw_32 [pw_32]
%endif
.loop:
%ifidn %1,sse2_misalign
punpcklbw m6, m0
punpcklbw m7, m0
paddw m6, m7 ; c1
- mova m7, [pw_1 GLOBAL] ; FIXME xmm8
+ mova m7, [pw_1] ; FIXME xmm8
FILT_H2 m1, m2, m3, m4, m5, m6
FILT_PACK m1, m4, 1
movntdq [r0+r2], m1
punpcklbw m1, m0 ; 00 -1 00 -2 00 -3 00 -4 00 -5 00 -6 00 -7 00 -8
movh m2, [src]
punpcklbw m2, m0
- mova m7, [pw_1 GLOBAL]
+ mova m7, [pw_1]
.loop:
movh m3, [src+8]
punpcklbw m3, m0
mova m3, [r1]
mova %4, [r1+r2]
mova m0, [r1+r2*2]
- mova %2, [filt_mul51 GLOBAL]
+ mova %2, [filt_mul51]
mova m4, m1
punpcklbw m1, m2
punpckhbw m4, m2
pmaddubsw m4, %2
pmaddubsw m0, %2
pmaddubsw m2, %2
- pmaddubsw m3, [filt_mul20 GLOBAL]
- pmaddubsw %1, [filt_mul20 GLOBAL]
+ pmaddubsw m3, [filt_mul20]
+ pmaddubsw %1, [filt_mul20]
psrlw %3, 8
psrlw %4, 8
paddw m1, m0
add r4, r5
neg r5
pxor xmm5, xmm5
- movdqa xmm4, [pd_128 GLOBAL]
+ movdqa xmm4, [pd_128]
.loop:
movq xmm2, [r2+r5] ; intra
movq xmm0, [r4+r5] ; invq
%endmacro
%macro HADDW 2
- pmaddwd %1, [pw_1 GLOBAL]
+ pmaddwd %1, [pw_1]
HADDD %1, %2
%endmacro
%endif
%ifidn %3, ssse3
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
%elifidn %3, sse2
- mova m7, [pw_00ff GLOBAL]
+ mova m7, [pw_00ff]
%elif %1 >= mmsize
pxor m7, m7
%endif
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
%if %1
- mova m7, [pw_00ff GLOBAL]
+ mova m7, [pw_00ff]
%else
pxor m7, m7 ; zero
%endif
cglobal x264_pixel_var2_8x8_ssse3, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
mov r5d, 2
.loop:
movq m0, [r0]
%macro SATD_START_SSE2 3
%ifnidn %1, sse2
- mova %3, [hmul_8p GLOBAL]
+ mova %3, [hmul_8p]
%endif
lea r4, [3*r1]
lea r5, [3*r3]
%ifnidn %1, sse2
cglobal x264_pixel_satd_4x4_%1, 4, 6, 6
SATD_START_MMX
- mova m4, [hmul_4p GLOBAL]
+ mova m4, [hmul_4p]
LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
cglobal x264_pixel_satd_4x8_%1, 4, 6, 8
SATD_START_MMX
%ifnidn %1, sse2
- mova m7, [hmul_4p GLOBAL]
+ mova m7, [hmul_4p]
%endif
movd m4, [r2]
movd m5, [r2+r3]
cglobal x264_pixel_satd_16x8_%1, 4,6,12
SATD_START_SSE2 %1, m10, m7
%ifidn %1, sse2
- mova m7, [pw_00ff GLOBAL]
+ mova m7, [pw_00ff]
%endif
jmp x264_pixel_satd_16x8_internal_%1
cglobal x264_pixel_satd_16x16_%1, 4,6,12
SATD_START_SSE2 %1, m10, m7
%ifidn %1, sse2
- mova m7, [pw_00ff GLOBAL]
+ mova m7, [pw_00ff]
%endif
call x264_pixel_satd_16x4_internal_%1
call x264_pixel_satd_16x4_internal_%1
lea r4, [3*r1]
lea r5, [3*r3]
%ifnidn %1, sse2
- mova m7, [hmul_8p GLOBAL]
+ mova m7, [hmul_8p]
%endif
call x264_pixel_sa8d_8x8_internal_%1
HADDW m0, m1
lea r4, [3*r1]
lea r5, [3*r3]
%ifnidn %1, sse2
- mova m7, [hmul_8p GLOBAL]
+ mova m7, [hmul_8p]
%endif
call x264_pixel_sa8d_8x8_internal_%1 ; pix[0]
add r2, 8
paddw m0, m1
HADAMARD2_2D 2, 6, 3, 7, 5, qdq, amax
%else ; non-sse2
- mova m7, [hmul_8p GLOBAL]
+ mova m7, [hmul_8p]
LOAD_SUMSUB_8x4P 0, 1, 2, 3, 5, 6, 7, r0, r2, 1
; could do first HADAMARD4_V here to save spilling later
; surprisingly, not a win on conroe or even p4
paddusw m2, m0
; 3x HADDW
- movdqa m7, [pw_1 GLOBAL]
+ movdqa m7, [pw_1]
pmaddwd m2, m7
pmaddwd m14, m7
pmaddwd m15, m7
ret
cglobal x264_hadamard_ac_8x8_mmxext
- mova m6, [mask_ac4 GLOBAL]
+ mova m6, [mask_ac4]
pxor m7, m7
call x264_hadamard_ac_4x4_mmxext
add r0, 4
mova m3, m0
paddusw m1, [rsp+0x38]
pxor m3, m2
- pand m3, [pw_1 GLOBAL]
+ pand m3, [pw_1]
pavgw m0, m2
psubusw m0, m3
HADDUW m0, m2
%endif
%ifnidn %1, sse2
;LOAD_INC loads sumsubs
- mova m7, [hmul_8p GLOBAL]
+ mova m7, [hmul_8p]
%else
;LOAD_INC only unpacks to words
pxor m7, m7
paddw m1, m2
SUMSUB_BA m0, m4; m2
%ifnidn %1, sse2
- pand m1, [mask_ac4b GLOBAL]
+ pand m1, [mask_ac4b]
%else
- pand m1, [mask_ac4 GLOBAL]
+ pand m1, [mask_ac4]
%endif
ABS_MOV m2, spill0
paddw m1, m3
paddw m2, m1
paddw m2, m2
ABS1 m4, m7
- pand m0, [mask_ac8 GLOBAL]
+ pand m0, [mask_ac8]
ABS1 m0, m7
paddw m2, m4
paddw m0, m2
SSIM_ITER 3
; PHADDW m1, m2
; PHADDD m3, m4
- movdqa m7, [pw_1 GLOBAL]
+ movdqa m7, [pw_1]
pshufd m5, m3, 0xb1
pmaddwd m1, m7
pmaddwd m2, m7
paddd m1, m2
paddd m2, m3
paddd m3, m4
- movdqa m5, [ssim_c1 GLOBAL]
- movdqa m6, [ssim_c2 GLOBAL]
+ movdqa m5, [ssim_c1]
+ movdqa m6, [ssim_c2]
TRANSPOSE4x4D 0, 1, 2, 3, 4
; s1=m0, s2=m1, ss=m2, s12=m3
je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
neg r2
%ifdef PIC
- lea r3, [mask_ff + 16 GLOBAL]
+ lea r3, [mask_ff + 16]
movdqu m1, [r3 + r2*4]
%else
- movdqu m1, [mask_ff + r2*4 + 16 GLOBAL]
+ movdqu m1, [mask_ff + r2*4 + 16]
%endif
pand m4, m1
.skip:
pavgb %2, %3
pxor %3, %5
mov%6 %1, %4
- pand %3, [pb_1 GLOBAL]
+ pand %3, [pb_1]
psubusb %2, %3
pavgb %1, %2
%endmacro
pxor mm1, mm1
psadbw mm0, [r1+7]
psadbw mm1, [r1+16]
- paddw mm0, [pw_8 GLOBAL]
+ paddw mm0, [pw_8]
paddw mm0, mm1
psrlw mm0, 4
pshufw mm0, mm0, 0
cglobal %1, 2,2
pxor mm0, mm0
psadbw mm0, [r1+%2]
- paddw mm0, [pw_4 GLOBAL]
+ paddw mm0, [pw_4]
psrlw mm0, 3
pshufw mm0, mm0, 0
packuswb mm0, mm0
cglobal predict_8x8c_p_core_mmxext, 1,2
LOAD_PLANE_ARGS
movq mm1, mm2
- pmullw mm2, [pw_3210 GLOBAL]
+ pmullw mm2, [pw_3210]
psllw mm1, 2
paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
- pmullw mm5, [pw_3210 GLOBAL]
+ pmullw mm5, [pw_3210]
psllw mm2, 3
psllw mm1, 2
movq mm3, mm2
;-----------------------------------------------------------------------------
cglobal predict_8x8_vr_sse2, 2,2,7
movdqu xmm0, [r1+8]
- movdqa xmm6, [pw_ff00 GLOBAL]
+ movdqa xmm6, [pw_ff00]
add r0, 4*FDEC_STRIDE
movdqa xmm1, xmm0
movdqa xmm2, xmm0
add r0, 4*FDEC_STRIDE
%ifidn %1, ssse3
movq mm5, [r1+7]
- movq mm6, [pb_reverse GLOBAL]
+ movq mm6, [pb_reverse]
movq mm1, mm5
movq mm2, mm5
movq mm3, mm5
%macro PRED_8x8C_H 1
cglobal predict_8x8c_h_%1, 1,1
%ifidn %1, ssse3
- mova m1, [pb_3 GLOBAL]
+ mova m1, [pb_3]
%endif
%assign n 0
%rep 8
pshufw mm2, r2m, 0
%endif
psrlw mm0, 3
- paddw mm1, [pw_2 GLOBAL]
+ paddw mm1, [pw_2]
movq mm3, mm2
pshufw mm1, mm1, 0
pshufw mm0, mm0, 0 ; dc0 (w)
punpcklqdq xmm0, xmm0
punpcklqdq xmm2, xmm2
punpcklqdq xmm4, xmm4
- pmullw xmm2, [pw_76543210 GLOBAL]
+ pmullw xmm2, [pw_76543210]
paddsw xmm0, xmm2 ; xmm0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
movdqa xmm3, xmm0
paddsw xmm3, xmm4
punpcklqdq xmm1, xmm1
punpcklqdq xmm2, xmm2
movdqa xmm3, xmm1
- pmullw xmm3, [pw_76543210 GLOBAL]
+ pmullw xmm3, [pw_76543210]
psllw xmm1, 3
paddsw xmm0, xmm3 ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
paddsw xmm1, xmm0 ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}
cglobal predict_16x16_h_%1, 1,2
mov r1, FDEC_STRIDE*12
%ifidn %1, ssse3
- mova m1, [pb_3 GLOBAL]
+ mova m1, [pb_3]
%endif
.vloop:
%assign n 0
REP_RET
cglobal predict_16x16_dc_top_mmxext, 1,2
- PRED16x16_DC [pw_8 GLOBAL], 4
+ PRED16x16_DC [pw_8], 4
REP_RET
cglobal predict_16x16_dc_left_core_mmxext, 1,1
RET
cglobal predict_16x16_dc_top_sse2, 1,1
- PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
+ PRED16x16_DC_SSE2 [pw_8], 4
RET
cglobal predict_16x16_dc_left_core_sse2, 1,1
%endmacro
%macro QUANT_DC_START_SSSE3 0
- movdqa m5, [pb_01 GLOBAL]
+ movdqa m5, [pb_01]
movd m6, r1m ; mf
movd m7, r2m ; bias
pshufb m6, m5
.rshift32:
neg t0d
movd m2, t0d
- mova m3, [pd_1 GLOBAL]
+ mova m3, [pd_1]
pxor m4, m4
pslld m3, m2
psrld m3, 1
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %3
%ifdef PIC
- lea r1, [dequant%2_scale GLOBAL]
+ lea r1, [dequant%2_scale]
add r1, t2
%else
- lea r1, [dequant%2_scale + t2 GLOBAL]
+ lea r1, [dequant%2_scale + t2]
%endif
movifnidn r0, r0mp
movd m4, t0d
.rshift32:
neg t0d
movd m3, t0d
- mova m4, [pw_1 GLOBAL]
+ mova m4, [pw_1]
mova m5, m4
pslld m4, m3
psrld m4, 1
;This is not true for score64.
cglobal x264_decimate_score%1_%2, 1,3
%ifdef PIC
- lea r10, [x264_decimate_table4 GLOBAL]
- lea r11, [decimate_mask_table4 GLOBAL]
+ lea r10, [x264_decimate_table4]
+ lea r11, [decimate_mask_table4]
%define table r10
%define mask_table r11
%else
%define table x264_decimate_table4
%define mask_table decimate_mask_table4
%endif
- DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
+ DECIMATE_MASK edx, eax, r0, [pb_1], %2, ecx
xor edx, 0xffff
je .ret
test eax, eax
%ifdef ARCH_X86_64
cglobal x264_decimate_score64_%1, 1,4
%ifdef PIC
- lea r10, [x264_decimate_table8 GLOBAL]
+ lea r10, [x264_decimate_table8]
%define table r10
%else
%define table x264_decimate_table8
%endif
- mova m5, [pb_1 GLOBAL]
+ mova m5, [pb_1]
DECIMATE_MASK r1d, eax, r0, m5, %1, null
test eax, eax
jne .ret9
%else
cglobal x264_decimate_score64_%1, 1,5
%endif
- mova m7, [pb_1 GLOBAL]
+ mova m7, [pb_1]
DECIMATE_MASK r3, r2, r0, m7, %1, r5
test r2, r2
jne .ret9
psadbw m0, m7
psadbw m1, m6
paddw m0, m1
- paddw m0, [pw_8 GLOBAL]
+ paddw m0, [pw_8]
psrlw m0, 4
punpcklbw m0, m0
pshufw m0, m0, 0x0 ;DC prediction
movq m6, [r1 - FDEC_STRIDE]
add r1, FDEC_STRIDE*4
%ifidn %1,ssse3
- movq m7, [pb_3 GLOBAL]
+ movq m7, [pb_3]
%endif
INTRA_SAD_HV_ITER 0, %1
INTRA_SAD_HV_ITER 2, %1
pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
%ifidn %1, ssse3
movq2dq xmm0, m0
- pshufb xmm0, [pb_shuf8x8c GLOBAL]
+ pshufb xmm0, [pb_shuf8x8c]
movq xmm1, [r0+FENC_STRIDE*0]
movq xmm2, [r0+FENC_STRIDE*1]
movq xmm3, [r0+FENC_STRIDE*2]
paddw mm0, mm1
movd r3d, mm0
%ifidn %1, ssse3
- mova m1, [pb_3 GLOBAL]
+ mova m1, [pb_3]
%endif
%assign x 0
%rep 16
%endif
%define sad_w16_addr (sad_w16_align1_%1 + (sad_w16_align1_%1 - sad_w16_align2_%1))
%ifdef PIC
- lea r5, [sad_w16_addr GLOBAL]
+ lea r5, [sad_w16_addr]
add r5, r4
%else
- lea r5, [sad_w16_addr + r4 GLOBAL]
+ lea r5, [sad_w16_addr + r4]
%endif
and r2, ~15
mov r4d, %2/2
jle x264_pixel_sad_%1x%2_mmxext
and eax, 7
shl eax, 3
- movd mm6, [sw_64 GLOBAL]
+ movd mm6, [sw_64]
movd mm7, eax
psubw mm6, mm7
PROLOGUE 4,5
%endif
%endmacro
-; PIC support macros.
-; x86_64 can't fit 64bit address literals in most instruction types,
-; so shared objects (under the assumption that they might be anywhere
-; in memory) must use an address mode that does fit.
-; So all accesses to global variables must use this macro, e.g.
-; mov eax, [foo GLOBAL]
-; instead of
-; mov eax, [foo]
-;
-; x86_32 doesn't require PIC.
-; Some distros prefer shared objects to be PIC, but nothing breaks if
-; the code contains a few textrels, so we'll skip that complexity.
-
%ifdef WIN64
%define PIC
%elifndef ARCH_X86_64
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
%undef PIC
%endif
%ifdef PIC
- %define GLOBAL wrt rip
-%else
- %define GLOBAL
+ default rel
%endif
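; With "default rel" in effect a plain [sym] memory operand assembles as
; RIP-relative on x86_64, which is what the old GLOBAL macro spelled out via
; "wrt rip". A minimal sketch of the two spellings (pw_32 stands in for any
; data symbol; not a line from this patch):
;     old:   paddw m0, [pw_32 GLOBAL]  ; GLOBAL expands to "wrt rip" under PIC
;     new:   default rel
;            paddw m0, [pw_32]         ; assembled RIP-relative automatically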
; Macros to eliminate most code duplication between x86_32 and x86_64:
; %3/%4: source regs
; %5/%6: tmp regs
%ifidn %1, d
-%define mask [mask_10 GLOBAL]
+%define mask [mask_10]
%define shift 16
%elifidn %1, q
-%define mask [mask_1100 GLOBAL]
+%define mask [mask_1100]
%define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
%endrep
%assign i 6
%rep 16-6
- movdqa xmm %+ i, [x %+ i GLOBAL]
+ movdqa xmm %+ i, [x %+ i]
%assign i i+1
%endrep
- mov r4, [n4 GLOBAL]
- mov r5, [n5 GLOBAL]
+ mov r4, [n4]
+ mov r5, [n5]
call r6
- xor r4, [n4 GLOBAL]
- xor r5, [n5 GLOBAL]
+ xor r4, [n4]
+ xor r5, [n5]
or r4, r5
pxor xmm5, xmm5
%assign i 6
%rep 16-6
- pxor xmm %+ i, [x %+ i GLOBAL]
+ pxor xmm %+ i, [x %+ i]
por xmm5, xmm %+ i
%assign i i+1
%endrep
or r4, r5
jz .ok
mov r4, rax
- lea r0, [error_message GLOBAL]
+ lea r0, [error_message]
call puts
mov r1, [rsp+stack_offset+16]
mov dword [r1], 0
or r3, r5
jz .ok
mov r3, eax
- lea r1, [error_message GLOBAL]
+ lea r1, [error_message]
push r1
call puts
add esp, 4