filt_mul51: times 8 db -5, 1
hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
+%if HIGH_BIT_DEPTH
+deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14  ; even words of a 32-byte register pair (NOTE(review): assumes SHUFFLE_MASK_W expands to 16-bit permute indices -- confirm against x86util)
+deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15  ; odd words
+%else
+deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30  ; even bytes across two 16-byte sources; indices 16-31 select the second source (vpperm two-register selector)
+deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31  ; odd bytes
+%endif
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
+; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
;-----------------------------------------------------------------------------
%macro HPEL_FILTER 0
cglobal hpel_filter_v, 5,6,11
- FIX_STRIDES r3d, r4d
-%if WIN64
- movsxd r4, r4d
-%endif
+ FIX_STRIDES r3, r4  ; args are intptr_t now: scale the full 64-bit regs, so the WIN64 movsxd fixup is no longer needed
lea r5, [r1+r3]
sub r1, r3
sub r1, r3
%define s30 [pad30]
%endif
add r0, r4
- lea r2, [r2+r4]
+ add r2, r4  ; r2 = buf + width; add is equivalent to the lea here and shorter (flags are not needed before neg)
neg r4
mova m7, [pw_pixel_max]
pxor m0, m0
REP_RET
;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,10
add r2, r2
add r0, r2
- lea r1, [r1+r2]
+ add r1, r2  ; r1 = buf + width*2; same result as the lea, smaller encoding
neg r2
mova m0, [tap1]
mova m7, [tap3]
REP_RET
;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width );
+; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h, 3,4,8
%define src r1+r2
%if HIGH_BIT_DEPTH == 0
%macro HPEL_V 1
;-----------------------------------------------------------------------------
-; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width );
+; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );  ; stride/width are 64-bit on x86-64 now, so the WIN64 sign-extension below is dropped
;-----------------------------------------------------------------------------
cglobal hpel_filter_v, 5,6,%1
-%if WIN64
- movsxd r4, r4d
-%endif
lea r5, [r1+r3]
sub r1, r3
sub r1, r3
%endmacro
;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
;-----------------------------------------------------------------------------
INIT_MMX
cglobal hpel_filter_c_mmx2, 3,3
REP_RET
;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_mmx2, 3,3
add r0, r2
%macro HPEL_C 0
;-----------------------------------------------------------------------------
-; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );  ; prototype-only change: width widened to intptr_t
;-----------------------------------------------------------------------------
cglobal hpel_filter_c, 3,3,9
add r0, r2
%endmacro
;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
cglobal hpel_filter_h_sse2, 3,3,8
add r0, r2
REP_RET
;-----------------------------------------------------------------------------
-; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
;-----------------------------------------------------------------------------
%macro HPEL_H 0
cglobal hpel_filter_h, 3,3
%macro HPEL 0
;-----------------------------------------------------------------------------
; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-; uint8_t *src, int stride, int width, int height)
+; uint8_t *src, intptr_t stride, int width, int height )  ; only stride widens; width/height stay int
;-----------------------------------------------------------------------------
cglobal hpel_filter, 7,9,16
-%if WIN64
- movsxd r4, r4d
- movsxd r5, r5d
-%endif
mov r7, r3
- sub r5, 16
+ sub r5d, 16  ; width is still a 32-bit int: without the old movsxd, r5's upper half is not sign-extended, so operate on r5d
mov r8, r1
and r7, 15
sub r3, r7
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void plane_copy_core( pixel *dst, int i_dst,
-; pixel *src, int i_src, int w, int h)
+; void plane_copy_core( pixel *dst, intptr_t i_dst,
+; pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
INIT_MMX
cglobal plane_copy_core_mmx2, 6,7
- FIX_STRIDES r1d, r3d, r4d
- movsxdifnidn r1, r1d
- movsxdifnidn r3, r3d
+ FIX_STRIDES r1, r3, r4d  ; strides are intptr_t (scale full regs); w stays a 32-bit int
+%if HIGH_BIT_DEPTH == 0  ; in the HBD path FIX_STRIDES does a 32-bit op on r4d, which zero-extends r4 implicitly -- confirm FIX_STRIDES definition
movsxdifnidn r4, r4d
+%endif
sub r1, r4
sub r3, r4
.loopy:
- mov r6d, r4d
- sub r6d, 63
+ lea r6d, [r4-63]  ; one lea replaces mov+sub: r6d = w-63; flags are not consumed here
.loopx:
prefetchnta [r2+256]
movq m0, [r2 ]
%macro PLANE_INTERLEAVE 0
;-----------------------------------------------------------------------------
-; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
-; uint8_t *srcu, int i_srcu,
-; uint8_t *srcv, int i_srcv, int w, int h )
+; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst,
+; uint8_t *srcu, intptr_t i_srcu,
+; uint8_t *srcv, intptr_t i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
-cglobal plane_copy_interleave_core, 7,9
- FIX_STRIDES r1d, r3d, r5d, r6d
+cglobal plane_copy_interleave_core, 6,9  ; auto-load only the 6 register args; w (arg 7) is fetched manually below
+ mov r6d, r6m  ; r6d = w
%if HIGH_BIT_DEPTH
- mov r1m, r1d
- mov r3m, r3d
- mov r6m, r6d
+ FIX_STRIDES r1, r3, r5, r6d  ; scale the intptr_t strides (full regs) and the 32-bit width by sizeof(pixel)
+ movifnidn r1mp, r1  ; spill scaled strides back to their pointer-sized arg slots (presumably reloaded later -- verify against the loop body)
+ movifnidn r3mp, r3
+ mov r6m, r6d
%endif
- movsxdifnidn r1, r1d
- movsxdifnidn r3, r3d
- movsxdifnidn r5, r5d
- movsxdifnidn r6, r6d
lea r0, [r0+r6*2]
add r2, r6
add r4, r6
RET
;-----------------------------------------------------------------------------
-; void store_interleave_chroma( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv, int height )
+; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
;-----------------------------------------------------------------------------
cglobal store_interleave_chroma, 5,5
- FIX_STRIDES r1d
+ FIX_STRIDES r1  ; stride is intptr_t: scale the full 64-bit reg
.loop:
INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
-; void plane_copy_deinterleave( pixel *dstu, int i_dstu,
-; pixel *dstv, int i_dstv,
-; pixel *src, int i_src, int w, int h )
+; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
+; pixel *dstv, intptr_t i_dstv,
+; pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave, 6,7
DEINTERLEAVE_START
mov r6d, r6m
- FIX_STRIDES r1d, r3d, r5d, r6d
+ FIX_STRIDES r1, r3, r5, r6d  ; strides arrive as intptr_t (full regs); width r6d stays 32-bit, so the old movsxdifnidn trio below is dead
%if HIGH_BIT_DEPTH
mov r6m, r6d
%endif
- movsxdifnidn r1, r1d
- movsxdifnidn r3, r3d
- movsxdifnidn r5, r5d
add r0, r6
add r2, r6
lea r4, [r4+r6*2]
REP_RET
;-----------------------------------------------------------------------------
-; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height )
+; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fenc, 4,4
DEINTERLEAVE_START
- FIX_STRIDES r2d
+ FIX_STRIDES r2  ; i_src is intptr_t: scale the full reg
.loop:
DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
REP_RET
;-----------------------------------------------------------------------------
-; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height )
+; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fdec, 4,4
DEINTERLEAVE_START
- FIX_STRIDES r2d
+ FIX_STRIDES r2  ; same as fenc variant: 64-bit stride scaling
.loop:
DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
-; void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
+; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )  ; prototype-only: stride widened, code already used the full reg
;-----------------------------------------------------------------------------
INIT_XMM
cglobal integral_init4h_sse4, 3,4
%macro INTEGRAL_INIT_8V 0
;-----------------------------------------------------------------------------
-; void integral_init8v( uint16_t *sum8, int stride )
+; void integral_init8v( uint16_t *sum8, intptr_t stride )
;-----------------------------------------------------------------------------
cglobal integral_init8v, 3,3
shl r1, 1
INTEGRAL_INIT_8V
;-----------------------------------------------------------------------------
-; void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
+; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal integral_init4v_mmx, 3,5
pavgb %4, [r0+r5*2+%7]
PALIGNR %1, %3, 1, m6
PALIGNR %2, %4, 1, m6
+%if cpuflag(xop)  ; XOP: only average here and leave bytes interleaved; the caller splits even/odd with a single vpperm at the use site
+ pavgb %1, %3
+ pavgb %2, %4
+%else
pavgb %1, %3
pavgb %2, %4
psrlw %5, %1, 8
psrlw %6, %2, 8
pand %1, m7
pand %2, m7
+%endif
%endmacro
%macro FILT16x2 4
pavgb %1, m3
PALIGNR m3, m2, 1, m6
pavgb m3, m2
+%if cpuflag(xop)
+ vpperm m5, m3, %1, m7  ; one two-source byte permute replaces the psrlw/pand/packuswb sequence; m6/m7 presumably hold deinterleave_shuf32a/b -- confirm where they are loaded
+ vpperm m3, m3, %1, m6
+%else
psrlw m5, m3, 8
psrlw m4, %1, 8
pand m3, m7
pand %1, m7
packuswb m3, %1
packuswb m5, m4
+%endif
mova [%2], m3
mova [%3], m5
mova %1, m2
pavgw %1, m3
PALIGNR m3, m2, 2, m6
pavgw m3, m2
+%if cpuflag(xop)
+ vpperm m5, m3, %1, m7  ; HBD variant: word-granularity deinterleave replaces psrld/pand/packssdw
+ vpperm m3, m3, %1, m6
+%else
psrld m5, m3, 16
psrld m4, %1, 16
pand m3, m7
pand %1, m7
packssdw m3, %1
packssdw m5, m4
+%endif
mova [%2], m3
mova [%3], m5
mova %1, m2
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
-; int src_stride, int dst_stride, int width, int height )
+; intptr_t src_stride, intptr_t dst_stride, int width, int height )
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
%if HIGH_BIT_DEPTH
shl dword r6m, 1
- FIX_STRIDES r5d
+ FIX_STRIDES r5  ; src_stride is intptr_t: scale the full reg; the WIN64 movsxd block below is removed
shl dword r7m, 1
-%endif
-%if WIN64
- movsxd r5, r5d
%endif
; src += 2*(height-1)*stride + 2*width
mov r6d, r8m
PUSH r6
%define src_gap [rsp]
%if HIGH_BIT_DEPTH
+%if cpuflag(xop)
+ mova m6, [deinterleave_shuf32a]  ; XOP: keep even/odd word permute masks resident in m6/m7 instead of the 0x0000ffff mask
+ mova m7, [deinterleave_shuf32b]
+%else
pcmpeqw m7, m7
psrld m7, 16
+%endif
.vloop:
mov r6d, r7m
%ifnidn cpuname, mmx2
sub r4, r6
add dst_gap, r6d
%endif ; mmsize
+%if cpuflag(xop)
+ mova m6, [deinterleave_shuf32a]  ; 8-bit path: even/odd byte permute masks replace the 0x00ff word mask
+ mova m7, [deinterleave_shuf32b]
+%else
pcmpeqb m7, m7
psrlw m7, 8
+%endif
.vloop:
mov r6d, r7m
%ifnidn cpuname, mmx2
jz .hloop
sub r0, 16
FILT8x4 m0, m1, m2, m3, m4, m5, 0
+%if cpuflag(xop)
+ mova m4, m0  ; preserve m0: vpperm overwrites its destination and both permutes read the original m0
+ vpperm m0, m4, m1, m6  ; even bytes of (m0,m1)
+ vpperm m1, m4, m1, m7  ; odd bytes
+ movq [r1], m0  ; low qwords to r1/r2, high qwords to r3/r4 (NOTE(review): r2/r3 ordering differs from the pack path because vpperm groups outputs differently -- verify intentional)
+ movq [r2], m1
+ movhps [r3], m0
+ movhps [r4], m1
+%else
packuswb m0, m4
packuswb m1, m5
movq [r1], m0
movhps [r2], m0
movq [r3], m1
movhps [r4], m1
+%endif
mova m0, m2
mova m1, m3
sub r6d, 8
mova m8, m0
mova m9, m1
FILT8x4 m2, m3, m0, m1, m4, m5, 0
+%if cpuflag(xop)
+ vpperm m4, m2, m8, m7  ; odd bytes of (m2,m8); FILT8x4 left results interleaved under XOP, so deinterleave here
+ vpperm m2, m2, m8, m6  ; even bytes (m4 computed first: this overwrites m2)
+ vpperm m5, m3, m9, m7
+ vpperm m3, m3, m9, m6
+%else
packuswb m2, m8
packuswb m3, m9
packuswb m4, m10
packuswb m5, m11
+%endif
mova [r1], m2
mova [r2], m4
mova [r3], m3
FRAME_INIT_LOWRES
INIT_XMM ssse3
FRAME_INIT_LOWRES
+INIT_XMM avx
+FRAME_INIT_LOWRES
+INIT_XMM xop
+FRAME_INIT_LOWRES  ; instantiate AVX and XOP variants of the lowres core in addition to sse2/ssse3
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
vmovdqu [r0+r6*2], ymm1
add r6, 16
jl .loop
- vzeroupper
- RET
+ REP_RET  ; rep-ret after a branch target avoids AMD's ret misprediction; NOTE(review): assumes x86inc now emits vzeroupper automatically in AVX functions' RET/REP_RET -- confirm x86inc version in this tree