;*****************************************************************************
;* mc-a.asm: x86 motion compensation
;*****************************************************************************
-;* Copyright (C) 2003-2013 x264 project
+;* Copyright (C) 2003-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
SECTION_RODATA 32
-pw_512: times 16 dw 512
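+; ch_shuf: pairs each chroma sample with its right-hand neighbour (even/U bytes in the
+; low half, odd/V bytes in the high half) for the ssse3 mc_chroma's horizontal filter.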
ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
ch_shuf_adj: times 8 db 0
times 8 db 2
cextern pw_8
cextern pw_32
cextern pw_64
+cextern pw_512
cextern pw_00ff
cextern pw_pixel_max
cextern sw_64
.height_loop:
movu m0, [r2]
movu m1, [r2+r3*2]
-%if mmsize == 8
+%if cpuflag(avx) || mmsize == 8
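+    ; AVX lifts SSE's 16-byte alignment requirement on memory operands (and 8-byte MMX
+    ; loads never had one), so pavgw can average directly from memory here.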
pavgw m0, [r2+r4]
pavgw m1, [r2+r6]
%else
AVG2_W_ONE 8
AVG2_W_TWO 10, movd, movd
AVG2_W_TWO 16, movu, mova
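+; 16 high-bit-depth pixels are 32 bytes, exactly one ymm register per row,
+; so the AVX2 version reuses the one-register-per-row variant.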
+INIT_YMM avx2
+AVG2_W_ONE 16
INIT_MMX
cglobal pixel_avg2_w10_mmx2, 6,7
jg .height_loop
RET
-INIT_XMM
-cglobal pixel_avg2_w18_sse2, 6,7,6
+%macro PIXEL_AVG_W18 0
+cglobal pixel_avg2_w18, 6,7
sub r4, r2
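+    ; r4 = src2 - src1, so [r2+r4] addresses the second source row while only r2 advances through the loop.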
.height_loop:
movu m0, [r2+ 0]
+ movd xm2, [r2+32]
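+    ; 18 pixels * 2 bytes = 36 bytes per row: 32 bytes of vector loads plus a 4-byte movd for the final two pixels.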
+%if mmsize == 32
+ pavgw m0, [r2+r4+ 0]
+ movd xm1, [r2+r4+32]
+ pavgw xm2, xm1
+%else
movu m1, [r2+16]
- movh m2, [r2+32]
movu m3, [r2+r4+ 0]
movu m4, [r2+r4+16]
- movh m5, [r2+r4+32]
+ movd m5, [r2+r4+32]
pavgw m0, m3
pavgw m1, m4
pavgw m2, m5
- mova [r0+ 0], m0
mova [r0+16], m1
- movh [r0+32], m2
+%endif
+ mova [r0+ 0], m0
+ movd [r0+32], xm2
lea r2, [r2+r3*2]
lea r0, [r0+r1*2]
dec r5d
jg .height_loop
RET
+%endmacro
+
+INIT_XMM sse2
+PIXEL_AVG_W18
+INIT_YMM avx2
+PIXEL_AVG_W18
+
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
jg .height_loop
RET
+INIT_XMM
cglobal pixel_avg2_w16_sse2, 6,7
sub r4, r2
lea r6, [r4+r3]
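+    ; r6 = (src2-src1)+stride, letting the loop average two rows per iteration.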
.height_loop:
- movdqu xmm0, [r2]
- movdqu xmm2, [r2+r3]
- movdqu xmm1, [r2+r4]
- movdqu xmm3, [r2+r6]
+ movu m0, [r2]
+ movu m2, [r2+r3]
+ movu m1, [r2+r4]
+ movu m3, [r2+r6]
lea r2, [r2+r3*2]
- pavgb xmm0, xmm1
- pavgb xmm2, xmm3
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm2
+ pavgb m0, m1
+ pavgb m2, m3
+ mova [r0], m0
+ mova [r0+r1], m2
lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
+ sub r5d, 2
+ jg .height_loop
RET
-%macro AVG2_W20 1
-cglobal pixel_avg2_w20_%1, 6,7
+cglobal pixel_avg2_w20_sse2, 6,7
sub r2, r4
lea r6, [r2+r3]
.height_loop:
- movdqu xmm0, [r4]
- movdqu xmm2, [r4+r3]
-%ifidn %1, sse2_misalign
- movd mm4, [r4+16]
- movd mm5, [r4+r3+16]
- pavgb xmm0, [r4+r2]
- pavgb xmm2, [r4+r6]
-%else
- movdqu xmm1, [r4+r2]
- movdqu xmm3, [r4+r6]
- movd mm4, [r4+16]
- movd mm5, [r4+r3+16]
- pavgb xmm0, xmm1
- pavgb xmm2, xmm3
-%endif
- pavgb mm4, [r4+r2+16]
- pavgb mm5, [r4+r6+16]
+ movu m0, [r4]
+ movu m2, [r4+r3]
+ movu m1, [r4+r2]
+ movu m3, [r4+r6]
+ movd mm4, [r4+16]
+ movd mm5, [r4+r3+16]
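+    ; the last 4 of each row's 20 bytes are handled in mm registers via movd.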
+ pavgb m0, m1
+ pavgb m2, m3
+ pavgb mm4, [r4+r2+16]
+ pavgb mm5, [r4+r6+16]
lea r4, [r4+r3*2]
- movdqa [r0], xmm0
- movd [r0+16], mm4
- movdqa [r0+r1], xmm2
- movd [r0+r1+16], mm5
+ mova [r0], m0
+ mova [r0+r1], m2
+ movd [r0+16], mm4
+ movd [r0+r1+16], mm5
lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
+ sub r5d, 2
+ jg .height_loop
RET
-%endmacro
-
-AVG2_W20 sse2
-AVG2_W20 sse2_misalign
INIT_YMM avx2
cglobal pixel_avg2_w20, 6,7
movu m1, [r2+%4*mmsize]
movu m2, [r2+r3+%3*mmsize]
movu m3, [r2+r3+%4*mmsize]
- movu m4, [r2+r3*2+%3*mmsize]
- movu m5, [r2+r3*2+%4*mmsize]
- movu m6, [r2+%2+%3*mmsize]
- movu m7, [r2+%2+%4*mmsize]
mova [r0+%3*mmsize], m0
mova [r0+%4*mmsize], m1
mova [r0+r1+%3*mmsize], m2
mova [r0+r1+%4*mmsize], m3
- mova [r0+r1*2+%3*mmsize], m4
- mova [r0+r1*2+%4*mmsize], m5
- mova [r0+%1+%3*mmsize], m6
- mova [r0+%1+%4*mmsize], m7
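+    ; rows 2 and 3 reuse m0-m3, so the whole 4-row copy needs only four vector registers.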
+ movu m0, [r2+r3*2+%3*mmsize]
+ movu m1, [r2+r3*2+%4*mmsize]
+ movu m2, [r2+%2+%3*mmsize]
+ movu m3, [r2+%2+%4*mmsize]
+ mova [r0+r1*2+%3*mmsize], m0
+ mova [r0+r1*2+%4*mmsize], m1
+ mova [r0+%1+%3*mmsize], m2
+ mova [r0+%1+%4*mmsize], m3
%endmacro
%macro COPY4 2
%macro MC_COPY 1
%assign %%w %1*SIZEOF_PIXEL/mmsize
%if %%w > 0
-cglobal mc_copy_w%1, 5,7,8*(%%w/2)
+cglobal mc_copy_w%1, 5,7
FIX_STRIDES r1, r3
lea r6, [r3*3]
lea r5, [r1*3]
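+    ; r6 = 3*src_stride and r5 = 3*dst_stride, so each loop iteration can address four rows.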
MC_COPY 16
INIT_XMM aligned, sse
MC_COPY 16
-
-
+%if HIGH_BIT_DEPTH
+INIT_YMM avx
+MC_COPY 16
+INIT_YMM aligned, avx
+MC_COPY 16
+%endif
;=============================================================================
; prefetch
%endmacro
%else ; !HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED 3
-%if mmsize == 8 || cpuflag(misalign)
+%if mmsize == 8
punpcklwd %1, %3
%else
movh %2, %3
%if ARCH_X86_64 ; too many regs for x86_32
RESET_MM_PERMUTATION
%if WIN64
-%if xmm_regs_used > 6
- %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
- %assign xmm_regs_used 6
-%endif
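+    ; undo the WIN64 prologue accounting: no stack padding or saved xmm registers are assumed from here on.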
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ %assign xmm_regs_used 0
%endif
.mc1dy:
and t2d, 7
%macro MC_CHROMA_SSSE3 0
cglobal mc_chroma
-%if cpuflag(avx2)
- MC_CHROMA_START 9
-%else
- MC_CHROMA_START 10
-%endif
+ MC_CHROMA_START 10-cpuflag(avx2)
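+    ; the avx2 version needs one register fewer than the others (9 vs 10).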
and r5d, 7
and t2d, 7
mov t0d, r5d
movhps [r1+r2], xm2
%else
movu m0, [r3]
- pshufb m0, xm5
+ pshufb m0, m5
.loop4:
movu m1, [r3+r4]
pshufb m1, m5
pmulhrsw m3, shiftround
mova m0, m4
packuswb m1, m3
+ movd [r0], m1
+%if cpuflag(sse4)
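+    ; SSE4.1's pextrd stores each remaining dword lane straight to memory,
+    ; avoiding the movhlps/psrldq shuffling below.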
+ pextrd [r1], m1, 1
+ pextrd [r0+r2], m1, 2
+ pextrd [r1+r2], m1, 3
+%else
movhlps m3, m1
- movd [r0], xm1
movd [r0+r2], m3
psrldq m1, 4
psrldq m3, 4
movd [r1], m1
movd [r1+r2], m3
+%endif
lea r3, [r3+r4*2]
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
MC_CHROMA
-INIT_XMM sse2, misalign
-MC_CHROMA
INIT_XMM sse2
MC_CHROMA
INIT_XMM ssse3