;*****************************************************************************
;* mc-a2.asm: x86 motion compensation
;*****************************************************************************
-;* Copyright (C) 2005-2010 x264 project
+;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
+pf_inv256: times 8 dd 0.00390625 ; 1/256
pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
cextern pw_00ff
cextern pw_3fff
cextern pw_pixel_max
-cextern pd_128
cextern pd_ffff
%macro LOAD_ADD 4
%macro LOAD_ADD_2 6
mova %5, %3
mova %1, %4
- mova %6, %5
- mova %2, %1
+ punpckhbw %6, %5, m0
punpcklbw %5, m0
+ punpckhbw %2, %1, m0
punpcklbw %1, m0
- punpckhbw %6, m0
- punpckhbw %2, m0
paddw %1, %5
paddw %2, %6
%endmacro
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
-%macro HPEL_FILTER 1
-cglobal hpel_filter_v_%1, 5,6,11*(mmsize/16)
+%macro HPEL_FILTER 0
+cglobal hpel_filter_v, 5,6,11
FIX_STRIDES r3d, r4d
%ifdef WIN64
movsxd r4, r4d
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_c_%1, 3,3,10*(mmsize/16)
+cglobal hpel_filter_c, 3,3,10
add r2, r2
add r0, r2
lea r1, [r1+r2]
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_h_%1, 3,4,8*(mmsize/16)
+cglobal hpel_filter_h, 3,4,8
%define src r1+r2
add r2, r2
add r0, r2
mova [r0+r2-mmsize*1], m4
jl .loop
REP_RET
-%endmacro
+%endmacro ; HPEL_FILTER
-INIT_MMX
-HPEL_FILTER mmxext
-INIT_XMM
-HPEL_FILTER sse2
+INIT_MMX mmx2
+HPEL_FILTER
+INIT_XMM sse2
+HPEL_FILTER
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
-INIT_MMX
-
-%macro HPEL_V 1-2 0
+%macro HPEL_V 1
;-----------------------------------------------------------------------------
; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_v_%1, 5,6,%2
+cglobal hpel_filter_v, 5,6,%1
%ifdef WIN64
movsxd r4, r4d
%endif
add r0, r4
lea r2, [r2+r4*2]
neg r4
-%ifnidn %1, ssse3
- pxor m0, m0
-%else
+%if cpuflag(ssse3)
mova m0, [filt_mul15]
+%else
+ pxor m0, m0
%endif
.loop:
-%ifidn %1, ssse3
+%if cpuflag(ssse3)
mova m1, [r1]
mova m4, [r1+r3]
mova m2, [r5+r3*2]
LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
- LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
+ LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
FILT_V2 m1, m2, m3, m4, m5, m6
%endif
mova m7, [pw_16]
jl .loop
REP_RET
%endmacro
-HPEL_V mmxext
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_c_mmxext, 3,3
+INIT_MMX
+cglobal hpel_filter_c_mmx2, 3,3
add r0, r2
lea r1, [r1+r2*2]
neg r2
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_h_mmxext, 3,3
+cglobal hpel_filter_h_mmx2, 3,3
add r0, r2
add r1, r2
neg r2
INIT_XMM
-%macro HPEL_C 1
+%macro HPEL_C 0
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_c_%1, 3,3,9
+cglobal hpel_filter_c, 3,3,9
add r0, r2
lea r1, [r1+r2*2]
neg r2
%define src r1+r2*2
-%ifnidn %1, sse2
+%ifnidn cpuname, sse2
mova m7, [pw_32]
%define tpw_32 m7
%elifdef ARCH_X86_64
%else
%define tpw_32 [pw_32]
%endif
-%ifidn %1,sse2_misalign
+%if cpuflag(misalign)
.loop:
movu m4, [src-4]
movu m5, [src-2]
mova m1, [src]
.loop:
mova m2, [src+16]
- mova m4, m1
- PALIGNR m4, m0, 12, m7
- mova m5, m1
- PALIGNR m5, m0, 14, m0
- mova m0, m2
- PALIGNR m0, m1, 6, m7
+ PALIGNR m4, m1, m0, 12, m7
+ PALIGNR m5, m1, m0, 14, m0
+ PALIGNR m0, m2, m1, 6, m7
paddw m4, m0
- mova m0, m2
- PALIGNR m0, m1, 4, m7
+ PALIGNR m0, m2, m1, 4, m7
paddw m5, m0
- mova m6, m2
- PALIGNR m6, m1, 2, m7
+ PALIGNR m6, m2, m1, 2, m7
paddw m6, m1
FILT_H m4, m5, m6
PALIGNR m2, m1, 12, m7
PALIGNR m5, m1, 14, m1
mova m1, [src+32]
- mova m3, m1
- PALIGNR m3, m0, 6, m7
+ PALIGNR m3, m1, m0, 6, m7
paddw m3, m2
- mova m6, m1
- PALIGNR m6, m0, 4, m7
+ PALIGNR m6, m1, m0, 4, m7
paddw m5, m6
- mova m6, m1
- PALIGNR m6, m0, 2, m7
+ PALIGNR m6, m1, m0, 2, m7
paddw m6, m0
FILT_H m3, m5, m6
%endif
mova m7, [pw_16]
.loop:
mova m2, [src+16]
- mova m3, m1
- palignr m3, m0, 14
- mova m4, m1
- palignr m4, m0, 15
- mova m0, m2
- palignr m0, m1, 2
+ palignr m3, m1, m0, 14
+ palignr m4, m1, m0, 15
+ palignr m0, m2, m1, 2
pmaddubsw m3, [filt_mul15]
pmaddubsw m4, [filt_mul15]
pmaddubsw m0, [filt_mul51]
- mova m5, m2
- palignr m5, m1, 1
- mova m6, m2
- palignr m6, m1, 3
+ palignr m5, m2, m1, 1
+ palignr m6, m2, m1, 3
paddw m3, m0
mova m0, m1
pmaddubsw m1, [filt_mul20]
add r2, 16
jl .loop
REP_RET
-%endif
-
-%define PALIGNR PALIGNR_MMX
+%endif ; !ARCH_X86_64
+
+INIT_MMX mmx2
+HPEL_V 0
+INIT_XMM sse2
+HPEL_V 8
+INIT_XMM sse2, misalign
+HPEL_C
%ifndef ARCH_X86_64
-HPEL_C sse2
+INIT_XMM sse2
+HPEL_C
+INIT_XMM ssse3
+HPEL_C
+HPEL_V 0
+INIT_XMM avx
+HPEL_C
+HPEL_V 0
%endif
-HPEL_V sse2, 8
-HPEL_C sse2_misalign
-%define PALIGNR PALIGNR_SSSE3
-HPEL_C ssse3
-HPEL_V ssse3
%ifdef ARCH_X86_64
-
-%macro DO_FILT_V 6
+%macro DO_FILT_V 5
;The optimum prefetch distance is difficult to determine in checkasm:
;any prefetch seems slower than not prefetching.
;In real use, the prefetch seems to be a slight win.
;+16 is picked somewhat arbitrarily here based on the fact that even one
;loop iteration is going to take longer than the prefetch.
prefetcht0 [r1+r2*2+16]
-%ifidn %6, ssse3
+%if cpuflag(ssse3)
mova m1, [r3]
mova m2, [r3+r2]
mova %3, [r3+r2*2]
mova m3, [r1]
mova %1, [r1+r2]
mova %2, [r1+r2*2]
- mova m4, m1
+ punpckhbw m4, m1, m2
punpcklbw m1, m2
- punpckhbw m4, m2
- mova m2, %1
+ punpckhbw m2, %1, %2
punpcklbw %1, %2
- punpckhbw m2, %2
- mova %2, m3
+ punpckhbw %2, m3, %3
punpcklbw m3, %3
- punpckhbw %2, %3
pmaddubsw m1, m12
pmaddubsw m4, m12
%endmacro
%macro FILT_C 4
- mova m1, %2
- PALIGNR m1, %1, 12, m2
- mova m2, %2
- PALIGNR m2, %1, 14, %1
- mova m3, %3
- PALIGNR m3, %2, 4, %1
- mova m4, %3
- PALIGNR m4, %2, 2, %1
+ PALIGNR m1, %2, %1, 12, m2
+ PALIGNR m2, %2, %1, 14, %1
+ PALIGNR m3, %3, %2, 4, %1
+ PALIGNR m4, %3, %2, 2, %1
paddw m3, m2
mova %1, %3
PALIGNR %3, %2, 6, m2
%endmacro
%macro ADD8TO16 5
- mova %3, %1
- mova %4, %2
+ punpckhbw %3, %1, %5
punpcklbw %1, %5
+ punpcklbw %4, %2, %5
punpckhbw %2, %5
- punpckhbw %3, %5
- punpcklbw %4, %5
paddw %2, %3
paddw %1, %4
%endmacro
-%macro DO_FILT_H 4
- mova m1, %2
- PALIGNR m1, %1, 14, m3
- mova m2, %2
- PALIGNR m2, %1, 15, m3
- mova m4, %3
- PALIGNR m4, %2, 1 , m3
- mova m5, %3
- PALIGNR m5, %2, 2 , m3
- mova m6, %3
- PALIGNR m6, %2, 3 , m3
+%macro DO_FILT_H 3
+ PALIGNR m1, %2, %1, 14, m3
+ PALIGNR m2, %2, %1, 15, m3
+ PALIGNR m4, %3, %2, 1 , m3
+ PALIGNR m5, %3, %2, 2 , m3
+ PALIGNR m6, %3, %2, 3 , m3
mova %1, %2
-%ifidn %4, sse2
- ADD8TO16 m1, m6, m12, m3, m0 ; a
- ADD8TO16 m2, m5, m12, m3, m0 ; b
- ADD8TO16 %2, m4, m12, m3, m0 ; c
- FILT_V2 m1, m2, %2, m6, m5, m4
- FILT_PACK m1, m6, 5, m15
-%else ; ssse3
+%if cpuflag(ssse3)
pmaddubsw m1, m12
pmaddubsw m2, m12
pmaddubsw %2, m14
paddw m2, m6
FILT_PACK m1, m2, 5, m15
pshufb m1, [hpel_shuf]
+%else ; sse2
+ ADD8TO16 m1, m6, m12, m3, m0 ; a
+ ADD8TO16 m2, m5, m12, m3, m0 ; b
+ ADD8TO16 %2, m4, m12, m3, m0 ; c
+ FILT_V2 m1, m2, %2, m6, m5, m4
+ FILT_PACK m1, m6, 5, m15
%endif
movntps [r0+r4], m1
mova %2, %3
%endmacro
-%macro HPEL 1
+%macro HPEL 0
;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; uint8_t *src, int stride, int width, int height)
;-----------------------------------------------------------------------------
-cglobal hpel_filter_%1, 7,7,16
+cglobal hpel_filter, 7,7,16
%ifdef WIN64
movsxd r4, r4d
movsxd r5, r5d
sub r3, r2
mov r4, r10
mova m15, [pw_16]
-%ifidn %1, sse2
- pxor m0, m0
-%else ; ssse3
+%if cpuflag(ssse3)
mova m0, [filt_mul51]
mova m12, [filt_mul15]
mova m14, [filt_mul20]
+%else
+ pxor m0, m0
%endif
;ALIGN 16
.loopy:
; first filter_v
- DO_FILT_V m8, m7, m13, m12, 0, %1
+ DO_FILT_V m8, m7, m13, m12, 0
;ALIGN 16
.loopx:
- DO_FILT_V m6, m5, m11, m12, 16, %1
+ DO_FILT_V m6, m5, m11, m12, 16
.lastx:
paddw m15, m15 ; pw_32
DO_FILT_C m9, m8, m7, m6
psrlw m15, 1 ; pw_16
movdqa m7, m5
- DO_FILT_H m10, m13, m11, %1
+ DO_FILT_H m10, m13, m11
add r4, 16
jl .loopx
cmp r4, 16
RET
%endmacro
-%define PALIGNR PALIGNR_MMX
-HPEL sse2
-%define PALIGNR PALIGNR_SSSE3
-HPEL ssse3
-%endif
+INIT_XMM sse2
+HPEL
+INIT_XMM ssse3
+HPEL
+INIT_XMM avx
+HPEL
+%endif ; ARCH_X86_64
%undef movntq
%undef movntps
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void plane_copy_core( uint8_t *dst, int i_dst,
-; uint8_t *src, int i_src, int w, int h)
+; void plane_copy_core( pixel *dst, int i_dst,
+; pixel *src, int i_src, int w, int h)
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
-cglobal plane_copy_core_mmxext, 6,7
+INIT_MMX
+cglobal plane_copy_core_mmx2, 6,7
+ FIX_STRIDES r1d, r3d, r4d
movsxdifnidn r1, r1d
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
sub r6d, 63
.loopx:
prefetchnta [r2+256]
- movq mm0, [r2 ]
- movq mm1, [r2+ 8]
- movntq [r0 ], mm0
- movntq [r0+ 8], mm1
- movq mm2, [r2+16]
- movq mm3, [r2+24]
- movntq [r0+16], mm2
- movntq [r0+24], mm3
- movq mm4, [r2+32]
- movq mm5, [r2+40]
- movntq [r0+32], mm4
- movntq [r0+40], mm5
- movq mm6, [r2+48]
- movq mm7, [r2+56]
- movntq [r0+48], mm6
- movntq [r0+56], mm7
+ movq m0, [r2 ]
+ movq m1, [r2+ 8]
+ movntq [r0 ], m0
+ movntq [r0+ 8], m1
+ movq m2, [r2+16]
+ movq m3, [r2+24]
+ movntq [r0+16], m2
+ movntq [r0+24], m3
+ movq m4, [r2+32]
+ movq m5, [r2+40]
+ movntq [r0+32], m4
+ movntq [r0+40], m5
+ movq m6, [r2+48]
+ movq m7, [r2+56]
+ movntq [r0+48], m6
+ movntq [r0+56], m7
add r2, 64
add r0, 64
sub r6d, 64
add r6d, 63
jle .end16
.loop16:
- movq mm0, [r2 ]
- movq mm1, [r2+8]
- movntq [r0 ], mm0
- movntq [r0+8], mm1
+ movq m0, [r2 ]
+ movq m1, [r2+8]
+ movntq [r0 ], m0
+ movntq [r0+8], m1
add r2, 16
add r0, 16
sub r6d, 16
emms
RET
-%ifdef HIGH_BIT_DEPTH
%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
-%if mmsize==16
- mov%4 m0, [%2]
- mov%4 m1, [%3]
- SBUTTERFLY wd, 0, 1, 2
- mov%5a [%1+ 0], m0
- mov%5a [%1+16], m1
+%ifdef HIGH_BIT_DEPTH
+%assign x 0
+%rep 16/mmsize
+ mov%4 m0, [%2+(x/2)*mmsize]
+ mov%4 m1, [%3+(x/2)*mmsize]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ mov%5a [%1+(x+0)*mmsize], m0
+ mov%5a [%1+(x+1)*mmsize], m2
+ %assign x (x+2)
+%endrep
%else
- movq m0, [%2+0]
- movq m1, [%3+0]
- SBUTTERFLY wd, 0, 1, 2
- mov%5q [%1+ 0], m0
- mov%5q [%1+ 8], m1
- movq m0, [%2+8]
- movq m1, [%3+8]
- SBUTTERFLY wd, 0, 1, 2
- mov%5q [%1+16], m0
- mov%5q [%1+24], m1
-%endif
-%endmacro
-
-%macro PLANE_INTERLEAVE 1
-;-----------------------------------------------------------------------------
-; void store_interleave_8x8x2( uint16_t *dst, int i_dst, uint16_t *srcu, uint16_t *srcv )
-;-----------------------------------------------------------------------------
-cglobal store_interleave_8x8x2_%1, 4,5
- mov r4d, 16
- FIX_STRIDES r1
-.loop:
- INTERLEAVE r0, r2, r3, a
- add r2, FDEC_STRIDEB
- add r3, FDEC_STRIDEB
- add r0, r1
- dec r4d
- jg .loop
- REP_RET
-
-%endmacro ; PLANE_INTERLEAVE
-
-INIT_MMX
-PLANE_INTERLEAVE mmxext
-INIT_XMM
-PLANE_INTERLEAVE sse2
-
-%endif ; HIGH_BIT_DEPTH
-
-%ifndef HIGH_BIT_DEPTH
-%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
movq m0, [%2]
%if mmsize==16
%ifidn %4, a
mov%5a [%1], m0
%else
movq m1, [%3]
- mova m2, m0
+ punpckhbw m2, m0, m1
punpcklbw m0, m1
- punpckhbw m2, m1
- mov%5a [%1], m0
+ mov%5a [%1+0], m0
mov%5a [%1+8], m2
%endif
+%endif ; HIGH_BIT_DEPTH
%endmacro
-%endif
-%macro DEINTERLEAVE 7 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant, is aligned
+%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
%ifdef HIGH_BIT_DEPTH
%assign n 0
%rep 16/mmsize
mova m0, [%3+(n+0)*mmsize]
mova m1, [%3+(n+1)*mmsize]
- mova m2, m0
- mova m3, m1
- pand m0, %6
- pand m1, %6
- psrld m2, 16
- psrld m3, 16
+ psrld m2, m0, 16
+ psrld m3, m1, 16
+ pand m0, %5
+ pand m1, %5
packssdw m0, m1
packssdw m2, m3
- mov%7 [%1+(n/2)*mmsize], m0
- mov%7 [%2+(n/2)*mmsize], m2
+ mov%6 [%1+(n/2)*mmsize], m0
+ mov%6 [%2+(n/2)*mmsize], m2
%assign n (n+2)
%endrep
%else ; !HIGH_BIT_DEPTH
%if mmsize==16
mova m0, [%3]
-%ifidn %5, ssse3
- pshufb m0, %6
+%if cpuflag(ssse3)
+ pshufb m0, %5
%else
mova m1, m0
- pand m0, %6
+ pand m0, %5
psrlw m1, 8
packuswb m0, m1
%endif
mova m1, [%3+8]
mova m2, m0
mova m3, m1
- pand m0, %6
- pand m1, %6
+ pand m0, %5
+ pand m1, %5
psrlw m2, 8
psrlw m3, 8
packuswb m0, m1
%endif ; HIGH_BIT_DEPTH
%endmacro
-%ifndef HIGH_BIT_DEPTH
-%macro PLANE_INTERLEAVE 1
+%macro PLANE_INTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
; uint8_t *srcu, int i_srcu,
; uint8_t *srcv, int i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
-cglobal plane_copy_interleave_core_%1, 6,7
- mov r6d, r6m
+cglobal plane_copy_interleave_core, 7,7
+ FIX_STRIDES r1d, r3d, r5d, r6d
+%ifdef HIGH_BIT_DEPTH
+ mov r1m, r1d
+ mov r3m, r3d
+ mov r6m, r6d
+%endif
movsxdifnidn r1, r1d
movsxdifnidn r3, r3d
movsxdifnidn r5, r5d
+ movsxdifnidn r6, r6d
lea r0, [r0+r6*2]
add r2, r6
add r4, r6
%else
DECLARE_REG_TMP 1,3
%endif
+ mov t1, r1
+ shr t1, SIZEOF_PIXEL
+ sub t1, r6
mov t0d, r7m
- mov t1d, r1d
- shr t1d, 1
- sub t1d, r6d
.loopy:
mov r6d, r6m
neg r6
mov r6d, r6m
neg r6
.loopx:
- INTERLEAVE r0+r6*2, r2+r6, r4+r6, u, nt
- INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, u, nt
- add r6, 16
+ INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
+ INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
+ add r6, 16*SIZEOF_PIXEL
jl .loopx
.pad:
+%assign n 0
+%rep SIZEOF_PIXEL
%if mmsize==8
- movntq [r0+r6*2], m0
- movntq [r0+r6*2+8], m0
- movntq [r0+r6*2+16], m0
- movntq [r0+r6*2+24], m0
+ movntq [r0+r6*2+(n+ 0)], m0
+ movntq [r0+r6*2+(n+ 8)], m0
+ movntq [r0+r6*2+(n+16)], m0
+ movntq [r0+r6*2+(n+24)], m0
%else
- movntdq [r0+r6*2], m0
- movntdq [r0+r6*2+16], m0
+ movntdq [r0+r6*2+(n+ 0)], m0
+ movntdq [r0+r6*2+(n+16)], m0
%endif
- add r6, 16
+ %assign n n+32
+%endrep
+ add r6, 16*SIZEOF_PIXEL
cmp r6, t1
jl .pad
add r0, r1mp
;-----------------------------------------------------------------------------
; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv )
;-----------------------------------------------------------------------------
-cglobal store_interleave_8x8x2_%1, 4,5
+cglobal store_interleave_8x8x2, 4,5
mov r4d, 4
+ FIX_STRIDES r1d
.loop:
- INTERLEAVE r0, r2, r3, a
- INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, a
- add r2, FDEC_STRIDE*2
- add r3, FDEC_STRIDE*2
+ INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
+ INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
+ add r2, FDEC_STRIDEB*2
+ add r3, FDEC_STRIDEB*2
lea r0, [r0+r1*2]
dec r4d
jg .loop
REP_RET
%endmacro ; PLANE_INTERLEAVE
-%endif ; !HIGH_BIT_DEPTH
-%macro DEINTERLEAVE_START 1
+%macro DEINTERLEAVE_START 0
%ifdef HIGH_BIT_DEPTH
mova m4, [pd_ffff]
-%elifidn %1, ssse3
+%elif cpuflag(ssse3)
mova m4, [deinterleave_shuf]
%else
mova m4, [pw_00ff]
%endif ; HIGH_BIT_DEPTH
%endmacro
-%macro PLANE_DEINTERLEAVE 1
+%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( pixel *dstu, int i_dstu,
; pixel *dstv, int i_dstv,
; pixel *src, int i_src, int w, int h )
;-----------------------------------------------------------------------------
-cglobal plane_copy_deinterleave_%1, 6,7
- DEINTERLEAVE_START %1
+cglobal plane_copy_deinterleave, 6,7
+ DEINTERLEAVE_START
mov r6d, r6m
FIX_STRIDES r1d, r3d, r5d, r6d
%ifdef HIGH_BIT_DEPTH
mov r6d, r6m
neg r6
.loopx:
- DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, %1, m4, u
- DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, %1, m4, u
+ DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
+ DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
add r6, 16*SIZEOF_PIXEL
jl .loopx
add r0, r1
;-----------------------------------------------------------------------------
; void load_deinterleave_8x8x2_fenc( pixel *dst, pixel *src, int i_src )
;-----------------------------------------------------------------------------
-cglobal load_deinterleave_8x8x2_fenc_%1, 3,4
- DEINTERLEAVE_START %1
+cglobal load_deinterleave_8x8x2_fenc, 3,4
+ DEINTERLEAVE_START
mov r3d, 4
FIX_STRIDES r2d
.loop:
- DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, %1, m4, a
- DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, %1, m4, a
+ DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
+ DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
add r0, FENC_STRIDEB*2
lea r1, [r1+r2*2]
dec r3d
;-----------------------------------------------------------------------------
; void load_deinterleave_8x8x2_fdec( pixel *dst, pixel *src, int i_src )
;-----------------------------------------------------------------------------
-cglobal load_deinterleave_8x8x2_fdec_%1, 3,4
- DEINTERLEAVE_START %1
+cglobal load_deinterleave_8x8x2_fdec, 3,4
+ DEINTERLEAVE_START
mov r3d, 4
FIX_STRIDES r2d
.loop:
- DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, %1, m4, a
- DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, %1, m4, a
+ DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
+ DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
add r0, FDEC_STRIDEB*2
lea r1, [r1+r2*2]
dec r3d
%endmacro ; PLANE_DEINTERLEAVE
%ifdef HIGH_BIT_DEPTH
-INIT_MMX
-PLANE_DEINTERLEAVE mmx
-INIT_XMM
-PLANE_DEINTERLEAVE sse2
+INIT_MMX mmx2
+PLANE_INTERLEAVE
+INIT_MMX mmx
+PLANE_DEINTERLEAVE
+INIT_XMM sse2
+PLANE_INTERLEAVE
+PLANE_DEINTERLEAVE
+INIT_XMM avx
+PLANE_INTERLEAVE
+PLANE_DEINTERLEAVE
%else
-INIT_MMX
-PLANE_INTERLEAVE mmxext
-PLANE_DEINTERLEAVE mmx
-INIT_XMM
-PLANE_INTERLEAVE sse2
-PLANE_DEINTERLEAVE sse2
-PLANE_DEINTERLEAVE ssse3
+INIT_MMX mmx2
+PLANE_INTERLEAVE
+INIT_MMX mmx
+PLANE_DEINTERLEAVE
+INIT_XMM sse2
+PLANE_INTERLEAVE
+PLANE_DEINTERLEAVE
+INIT_XMM ssse3
+PLANE_DEINTERLEAVE
%endif
; These functions are not general-use; not only do the SSE ones require aligned input,
-; but they also will fail if given a non-mod16 size or a size less than 64.
+; but they will also fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
+INIT_MMX
cglobal memcpy_aligned_mmx, 3,3
test r2d, 16
- jz .copy32
+ jz .copy32start
sub r2d, 16
movq mm0, [r1 + r2 + 0]
movq mm1, [r1 + r2 + 8]
movq [r0 + r2 + 0], mm0
movq [r0 + r2 + 8], mm1
+.copy32start:
+ test r2d, r2d
+ jz .ret
.copy32:
sub r2d, 32
movq mm0, [r1 + r2 + 0]
movq [r0 + r2 + 16], mm2
movq [r0 + r2 + 24], mm3
jg .copy32
+.ret:
REP_RET
;-----------------------------------------------------------------------------
movdqa [r0 + r2], xmm0
.copy32:
test r2d, 32
- jz .copy64
+ jz .copy64start
sub r2d, 32
movdqa xmm0, [r1 + r2 + 0]
movdqa [r0 + r2 + 0], xmm0
movdqa xmm1, [r1 + r2 + 16]
movdqa [r0 + r2 + 16], xmm1
+.copy64start:
+ test r2d, r2d
+ jz .ret
.copy64:
sub r2d, 64
movdqa xmm0, [r1 + r2 + 0]
movdqa xmm3, [r1 + r2 + 48]
movdqa [r0 + r2 + 48], xmm3
jg .copy64
+.ret:
REP_RET
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
-%macro MEMZERO 1
-cglobal memzero_aligned_%1, 2,2
+%macro MEMZERO 0
+cglobal memzero_aligned, 2,2
add r0, r1
neg r1
pxor m0, m0
REP_RET
%endmacro
-INIT_MMX
-MEMZERO mmx
-INIT_XMM
-MEMZERO sse2
+INIT_MMX mmx
+MEMZERO
+INIT_XMM sse2
+MEMZERO
+%ifndef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
+INIT_XMM
cglobal integral_init4h_sse4, 3,4
lea r3, [r0+r2*2]
add r1, r2
jl .loop
REP_RET
-cglobal integral_init8h_sse4, 3,4
+%macro INTEGRAL_INIT8H 0
+cglobal integral_init8h, 3,4
lea r3, [r0+r2*2]
add r1, r2
neg r2
movdqa m0, [r1+r2]
movdqa m1, [r1+r2+16]
palignr m1, m0, 8
- movdqa m2, m0
- movdqa m3, m1
+ mpsadbw m2, m0, m4, 4
+ mpsadbw m3, m1, m4, 4
mpsadbw m0, m4, 0
mpsadbw m1, m4, 0
- mpsadbw m2, m4, 4
- mpsadbw m3, m4, 4
paddw m0, [r0+r2*2]
paddw m1, [r0+r2*2+16]
paddw m0, m2
add r2, 16
jl .loop
REP_RET
+%endmacro
+
+INIT_XMM sse4
+INTEGRAL_INIT8H
+INIT_XMM avx
+INTEGRAL_INIT8H
+%endif ; !HIGH_BIT_DEPTH
-%macro INTEGRAL_INIT_8V 1
+%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
; void integral_init8v( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
-cglobal integral_init8v_%1, 3,3
+cglobal integral_init8v, 3,3
shl r1, 1
add r0, r1
lea r2, [r0+r1*8]
REP_RET
%endmacro
-INIT_MMX
-INTEGRAL_INIT_8V mmx
-INIT_XMM
-INTEGRAL_INIT_8V sse2
+INIT_MMX mmx
+INTEGRAL_INIT_8V
+INIT_XMM sse2
+INTEGRAL_INIT_8V
;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
PALIGNR %2, %4, 1, m6
pavgb %1, %3
pavgb %2, %4
- mova %5, %1
- mova %6, %2
+ psrlw %5, %1, 8
+ psrlw %6, %2, 8
pand %1, m7
pand %2, m7
- psrlw %5, 8
- psrlw %6, 8
%endmacro
%macro FILT16x2 4
pavgb %1, m3
PALIGNR m3, m2, 1, m6
pavgb m3, m2
- mova m5, m3
- mova m4, %1
+ psrlw m5, m3, 8
+ psrlw m4, %1, 8
pand m3, m7
pand %1, m7
- psrlw m5, 8
- psrlw m4, 8
packuswb m3, %1
packuswb m5, m4
mova [%2], m3
pavgb m0, [r0+%3+r5+1]
pavgb m1, m3
pavgb m0, m2
- mova m3, m1
- mova m2, m0
+ psrlw m3, m1, 8
+ psrlw m2, m0, 8
pand m1, m7
pand m0, m7
- psrlw m3, 8
- psrlw m2, 8
packuswb m0, m1
packuswb m2, m3
mova [%1], m0
mova [%2], m2
%endmacro
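+; FILT8xU / FILT8xA: HIGH_BIT_DEPTH counterparts of the 8-bit lowres filters
+; above. Both average 2x2 blocks of 16-bit pixels (pavgw with the next row,
+; then with the neighbouring column) and split the even/odd column results
+; into the two destination planes. FILT8xU is the unaligned mmx2 variant;
+; FILT8xA carries the previous block's vertical average in %1 so PALIGNR can
+; shift across block boundaries.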
+%macro FILT8xU 3
+ mova m3, [r0+%3+8]
+ mova m2, [r0+%3]
+ pavgw m3, [r0+%3+r5+8]
+ pavgw m2, [r0+%3+r5]
+ movu m1, [r0+%3+10]
+ movu m0, [r0+%3+2]
+ pavgw m1, [r0+%3+r5+10]
+ pavgw m0, [r0+%3+r5+2]
+ pavgw m1, m3
+ pavgw m0, m2
+ psrld m3, m1, 16
+ psrld m2, m0, 16
+ pand m1, m7
+ pand m0, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ movu [%1], m0
+ mova [%2], m2
+%endmacro
+
+%macro FILT8xA 4
+ mova m3, [r0+%4+mmsize]
+ mova m2, [r0+%4]
+ pavgw m3, [r0+%4+r5+mmsize]
+ pavgw m2, [r0+%4+r5]
+ PALIGNR %1, m3, 2, m6
+ pavgw %1, m3
+ PALIGNR m3, m2, 2, m6
+ pavgw m3, m2
+ psrld m5, m3, 16
+ psrld m4, %1, 16
+ pand m3, m7
+ pand %1, m7
+ packssdw m3, %1
+ packssdw m5, m4
+ mova [%2], m3
+ mova [%3], m5
+ mova %1, m2
+%endmacro
+
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
-%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
-cglobal frame_init_lowres_core_%1, 6,7,%2
+%macro FRAME_INIT_LOWRES 0
+cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
+%ifdef HIGH_BIT_DEPTH
+ shl dword r6m, 1
+ FIX_STRIDES r5d
+ shl dword r7m, 1
+%endif
%ifdef WIN64
- movsxd r5, r5d
+ movsxd r5, r5d
%endif
; src += 2*(height-1)*stride + 2*width
mov r6d, r8m
shl r6d, 1
PUSH r6
%define src_gap [rsp]
+%ifdef HIGH_BIT_DEPTH
+ pcmpeqw m7, m7
+ psrld m7, 16
+.vloop:
+ mov r6d, r7m
+%ifnidn cpuname, mmx2
+ mova m0, [r0]
+ mova m1, [r0+r5]
+ pavgw m0, m1
+ pavgw m1, [r0+r5*2]
+%endif
+.hloop:
+ sub r0, mmsize*2
+ sub r1, mmsize
+ sub r2, mmsize
+ sub r3, mmsize
+ sub r4, mmsize
+%ifidn cpuname, mmx2
+ FILT8xU r1, r2, 0
+ FILT8xU r3, r4, r5
+%else
+ FILT8xA m0, r1, r2, 0
+ FILT8xA m1, r3, r4, r5
+%endif
+ sub r6d, mmsize
+ jg .hloop
+%else ; !HIGH_BIT_DEPTH
%if mmsize == 16
; adjust for the odd end case
mov r6d, r7m
psrlw m7, 8
.vloop:
mov r6d, r7m
-%ifnidn %1, mmxext
+%ifnidn cpuname, mmx2
mova m0, [r0]
mova m1, [r0+r5]
pavgb m0, m1
mova m0, m2
mova m1, m3
sub r6d, 8
+ jz .skip
%endif ; mmsize
.hloop:
sub r0, mmsize*2
mova [r2], m4
mova [r3], m3
mova [r4], m5
-%elifidn %1, mmxext
+%elifidn cpuname, mmx2
FILT8x2U r1, r2, 0
FILT8x2U r3, r4, r5
%else
%endif
sub r6d, mmsize
jg .hloop
+%endif ; HIGH_BIT_DEPTH
.skip:
mov r6, dst_gap
sub r0, src_gap
RET
%endmacro ; FRAME_INIT_LOWRES
-INIT_MMX
-%define PALIGNR PALIGNR_MMX
-FRAME_INIT_LOWRES mmxext
+INIT_MMX mmx2
+FRAME_INIT_LOWRES
%ifndef ARCH_X86_64
-FRAME_INIT_LOWRES cache32_mmxext
+INIT_MMX cache32, mmx2
+FRAME_INIT_LOWRES
%endif
-INIT_XMM
-FRAME_INIT_LOWRES sse2, 12
-%define PALIGNR PALIGNR_SSSE3
-FRAME_INIT_LOWRES ssse3, 12
+INIT_XMM sse2
+FRAME_INIT_LOWRES
+INIT_XMM ssse3
+FRAME_INIT_LOWRES
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-; uint16_t *inter_costs, uint16_t *inv_qscales, int len )
+; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
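+; Rough scalar equivalent of the routines below, for reference only (a sketch;
+; variable names are illustrative). The asm replaces the divide with rcpps plus
+; one Newton-Raphson refinement and rounds the result with cvtps2dq:
+;     float fps = *fps_factor / 256.f;
+;     for( int i = 0; i < len; i++ )
+;     {
+;         float intra = intra_costs[i];
+;         float inter = inter_costs[i] & 0x3fff;
+;         float prop  = propagate_in[i] + intra * inv_qscales[i] * fps;
+;         dst[i] = (int)(prop * (intra - inter) / intra + 0.5f);
+;     }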
-cglobal mbtree_propagate_cost_sse2, 6,6,7
- shl r5d, 1
- lea r0, [r0+r5*2]
- add r1, r5
- add r2, r5
- add r3, r5
- add r4, r5
- neg r5
- pxor xmm5, xmm5
- movdqa xmm6, [pw_3fff]
- movdqa xmm4, [pd_128]
+INIT_XMM
+cglobal mbtree_propagate_cost_sse2, 7,7,7
+ add r6d, r6d
+ lea r0, [r0+r6*2]
+ add r1, r6
+ add r2, r6
+ add r3, r6
+ add r4, r6
+ neg r6
+ pxor xmm4, xmm4
+ movss xmm6, [r5]
+ shufps xmm6, xmm6, 0
+ mulps xmm6, [pf_inv256]
+ movdqa xmm5, [pw_3fff]
.loop:
- movq xmm2, [r2+r5] ; intra
- movq xmm0, [r4+r5] ; invq
- movq xmm3, [r3+r5] ; inter
- movq xmm1, [r1+r5] ; prop
- punpcklwd xmm2, xmm5
- punpcklwd xmm0, xmm5
+ movq xmm2, [r2+r6] ; intra
+ movq xmm0, [r4+r6] ; invq
+ movq xmm3, [r3+r6] ; inter
+ movq xmm1, [r1+r6] ; prop
+ punpcklwd xmm2, xmm4
+ punpcklwd xmm0, xmm4
pmaddwd xmm0, xmm2
- pand xmm3, xmm6
- punpcklwd xmm1, xmm5
- punpcklwd xmm3, xmm5
- paddd xmm0, xmm4
- psrld xmm0, 8 ; intra*invq>>8
- paddd xmm0, xmm1 ; prop + (intra*invq>>8)
+ pand xmm3, xmm5
+ punpcklwd xmm1, xmm4
+ punpcklwd xmm3, xmm4
+ cvtdq2ps xmm0, xmm0
+ mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
+ cvtdq2ps xmm1, xmm1 ; prop
+ addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8)
cvtdq2ps xmm1, xmm2 ; intra
psubd xmm2, xmm3 ; intra - inter
+ cvtdq2ps xmm2, xmm2 ; intra - inter
rcpps xmm3, xmm1 ; 1 / intra 1st approximation
- cvtdq2ps xmm0, xmm0
mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
- cvtdq2ps xmm2, xmm2
mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
- mulps xmm0, xmm2 ; (prop + (intra*invq>>8)) * (intra - inter)
+ mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
subps xmm3, xmm1 ; 2nd approximation for 1/intra
mulps xmm0, xmm3 ; / intra
- cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation
- movdqa [r0+r5*2], xmm0
- add r5, 8
+ cvtps2dq xmm0, xmm0
+ movdqa [r0+r6*2], xmm0
+ add r6, 8
jl .loop
REP_RET
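+; INT16_TO_FLOAT: zero-extend the 8 packed uint16s in xmm%1 to dwords and
+; convert them to 8 packed floats in ymm%1 (assumes xmm7 holds zero).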
+%macro INT16_TO_FLOAT 1
+ vpunpckhwd xmm4, xmm%1, xmm7
+ vpunpcklwd xmm%1, xmm7
+ vinsertf128 ymm%1, ymm%1, xmm4, 1
+ vcvtdq2ps ymm%1, ymm%1
+%endmacro
+
+; FIXME: align loads/stores to 16 bytes
+cglobal mbtree_propagate_cost_avx, 7,7,8
+ add r6d, r6d
+ lea r0, [r0+r6*2]
+ add r1, r6
+ add r2, r6
+ add r3, r6
+ add r4, r6
+ neg r6
+ vmovdqa xmm5, [pw_3fff]
+ vbroadcastss ymm6, [r5]
+ vmulps ymm6, ymm6, [pf_inv256]
+ vpxor xmm7, xmm7
+.loop:
+ vmovdqu xmm0, [r2+r6] ; intra
+ vmovdqu xmm1, [r4+r6] ; invq
+ vmovdqu xmm2, [r1+r6] ; prop
+ vpand xmm3, xmm5, [r3+r6] ; inter
+ INT16_TO_FLOAT 0
+ INT16_TO_FLOAT 1
+ INT16_TO_FLOAT 2
+ INT16_TO_FLOAT 3
+ vmulps ymm1, ymm1, ymm0
+ vsubps ymm4, ymm0, ymm3
+ vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8
+ vaddps ymm1, ymm1, ymm2 ; prop + (intra*invq*fps_factor>>8)
+ vrcpps ymm3, ymm0 ; 1 / intra 1st approximation
+ vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx)
+ vmulps ymm2, ymm2, ymm3 ; intra * (1/intra 1st approx)^2
+ vmulps ymm1, ymm1, ymm4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+ vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx)
+ vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra
+ vmulps ymm1, ymm1, ymm3 ; / intra
+ vcvtps2dq ymm1, ymm1
+ vmovdqu [r0+r6*2], ymm1
+ add r6, 16
+ jl .loop
+ vzeroupper
+ RET