;*****************************************************************************
;* mc-a.asm: x86 motion compensation
;*****************************************************************************
-;* Copyright (C) 2003-2011 x264 project
+;* Copyright (C) 2003-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
SECTION_RODATA 32
-ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
+ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
ch_shuf_adj: times 8 db 0
times 8 db 2
times 8 db 4
cextern pw_8
cextern pw_32
cextern pw_64
+cextern pw_512
cextern pw_00ff
cextern pw_pixel_max
cextern sw_64
cextern pd_32
+cextern deinterleave_shufd
;=============================================================================
; implicit weighted biprediction
;=============================================================================
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
-%ifdef ARCH_X86_64
- DECLARE_REG_TMP 0,1,2,3,4,5,10,11
+%if WIN64
+ DECLARE_REG_TMP 0,1,2,3,4,5,4,5
%macro AVG_START 0-1 0
PROLOGUE 6,7,%1
-%ifdef WIN64
- movsxd r5, r5d
-%endif
+ %endmacro
+%elif UNIX64
+ DECLARE_REG_TMP 0,1,2,3,4,5,7,8
+ %macro AVG_START 0-1 0
+ PROLOGUE 6,9,%1
%endmacro
%else
DECLARE_REG_TMP 1,2,3,4,5,6,1,2
%endif
%macro AVG_END 0
- sub eax, 2
lea t4, [t4+t5*2*SIZEOF_PIXEL]
lea t2, [t2+t3*2*SIZEOF_PIXEL]
lea t0, [t0+t1*2*SIZEOF_PIXEL]
+ sub eax, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro BIWEIGHT_MMX 2
movh m0, %1
movh m1, %2
punpcklbw m0, m1
pmaddubsw m0, m3
- paddw m0, m4
- psraw m0, 6
+ pmulhrsw m0, m4
%endmacro
%macro BIWEIGHT_START_SSSE3 0
sub t7d, t6d
shl t7d, 8
add t6d, t7d
- movd m3, t6d
- mova m4, [pw_32]
+ mova m4, [pw_512]
+ movd xm3, t6d
+%if cpuflag(avx2)
+ vpbroadcastw m3, xm3
+%else
SPLATW m3, m3 ; weight_dst,src
+%endif
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro BIWEIGHT_ROW 4
BIWEIGHT [%2], [%3]
%if %4==mmsize/4
%endif ;HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; int pixel_avg_weight_w16( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight )
+; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 1-2 0
cglobal pixel_avg_weight_w%1
BIWEIGHT_START
AVG_START %2
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
mova m7, [pw_pixel_max]
%endif
.height_loop:
BIWEIGHT [t2], [t4]
SWAP 0, 6
BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
packssdw m6, m0
CLIPW m6, m5, m7
%else ;!HIGH_BIT_DEPTH
AVG_WEIGHT 4
AVG_WEIGHT 8
AVG_WEIGHT 16
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
AVG_WEIGHT 4, 8
AVG_WEIGHT 8, 8
INIT_XMM ssse3
AVG_WEIGHT 8, 7
AVG_WEIGHT 16, 7
+
+INIT_YMM avx2
+cglobal pixel_avg_weight_w16
+ BIWEIGHT_START
+ AVG_START 5
+.height_loop:
+ movu xm0, [t2]
+ movu xm1, [t4]
+ vinserti128 m0, m0, [t2+t3], 1
+ vinserti128 m1, m1, [t4+t5], 1
+ SBUTTERFLY bw, 0, 1, 2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ packuswb m0, m1
+ mova [t0], xm0
+ vextracti128 [t0+t1], m0, 1
+ AVG_END
%endif ;HIGH_BIT_DEPTH
;=============================================================================
; P frame explicit weighted prediction
;=============================================================================
-%ifdef HIGH_BIT_DEPTH
-%macro WEIGHT_START 1 ; (width)
+%if HIGH_BIT_DEPTH
+; width
+%macro WEIGHT_START 1
mova m0, [r4+ 0] ; 1<<denom
mova m3, [r4+16]
movd m2, [r4+32] ; denom
paddw m2, [sq_1] ; denom+1
%endmacro
-%macro WEIGHT 2 ; (src1, src2)
+; src1, src2
+%macro WEIGHT 2
movh m5, [%1]
movh m6, [%2]
punpcklwd m5, m0
packssdw m5, m6
%endmacro
-%macro WEIGHT_TWO_ROW 3 ; (src, dst, width)
+; src, dst, width, fast
+%macro WEIGHT_TWO_ROW 4
%assign x 0
%rep (%3+mmsize/2-1)/(mmsize/2)
%if %3-x/2 <= 4 && mmsize == 16
%else ; !HIGH_BIT_DEPTH
%macro WEIGHT_START 1
+%if cpuflag(avx2)
+ vbroadcasti128 m3, [r4]
+ vbroadcasti128 m4, [r4+16]
+%else
mova m3, [r4]
- mova m6, [r4+16]
+ mova m4, [r4+16]
+%if notcpuflag(ssse3)
movd m5, [r4+32]
- pxor m2, m2
-%if (%1 == 20 || %1 == 12) && mmsize == 16
- movdq2q mm3, xmm3
- movdq2q mm4, xmm4
- movdq2q mm5, xmm5
- movdq2q mm6, xmm6
- pxor mm2, mm2
%endif
-%endmacro
-
-%macro WEIGHT_START_SSSE3 1
- mova m3, [r4]
- mova m4, [r4+16]
- pxor m2, m2
-%if %1 == 20 || %1 == 12
- movdq2q mm3, xmm3
- movdq2q mm4, xmm4
- pxor mm2, mm2
%endif
+ pxor m2, m2
%endmacro
-;; macro to weight mmsize bytes taking half from %1 and half from %2
-%macro WEIGHT 2 ; (src1,src2)
- movh m0, [%1]
- movh m1, [%2]
- punpcklbw m0, m2 ;setup
- punpcklbw m1, m2 ;setup
- pmullw m0, m3 ;scale
- pmullw m1, m3 ;scale
- paddsw m0, m6 ;1<<(denom-1)+(offset<<denom)
- paddsw m1, m6 ;1<<(denom-1)+(offset<<denom)
- psraw m0, m5 ;denom
- psraw m1, m5 ;denom
-%endmacro
-
-%macro WEIGHT_SSSE3 2
- movh m0, [%1]
- movh m1, [%2]
+; src1, src2, dst1, dst2, fast
+%macro WEIGHT_ROWx2 5
+ movh m0, [%1 ]
+ movh m1, [%1+mmsize/2]
+ movh m6, [%2 ]
+ movh m7, [%2+mmsize/2]
punpcklbw m0, m2
punpcklbw m1, m2
+ punpcklbw m6, m2
+ punpcklbw m7, m2
+%if cpuflag(ssse3)
+%if %5==0
psllw m0, 7
psllw m1, 7
+ psllw m6, 7
+ psllw m7, 7
+%endif
pmulhrsw m0, m3
pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ pmulhrsw m7, m3
paddw m0, m4
paddw m1, m4
+ paddw m6, m4
+ paddw m7, m4
+%else
+ pmullw m0, m3
+ pmullw m1, m3
+ pmullw m6, m3
+ pmullw m7, m3
+ paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
+ paddsw m1, m4
+ paddsw m6, m4
+ paddsw m7, m4
+ psraw m0, m5
+ psraw m1, m5
+ psraw m6, m5
+ psraw m7, m5
+%endif
+ packuswb m0, m1
+ packuswb m6, m7
+ mova [%3], m0
+ mova [%4], m6
%endmacro
-%macro WEIGHT_SAVE_ROW 3 ;(src,dst,width)
-%if %3 == 16
- mova [%2], %1
-%elif %3 == 8
- movq [%2], %1
+; src1, src2, dst1, dst2, width, fast
+%macro WEIGHT_COL 6
+%if cpuflag(avx2)
+%if %5==16
+ movu xm0, [%1]
+ vinserti128 m0, m0, [%2], 1
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m0, m2
+%if %6==0
+ psllw m0, 7
+ psllw m1, 7
+%endif
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m0, m4
+ paddw m1, m4
+ packuswb m0, m1
+ mova [%3], xm0
+ vextracti128 [%4], m0, 1
%else
- movd [%2], %1 ; width 2 can write garbage for last 2 bytes
+ movq xm0, [%1]
+ vinserti128 m0, m0, [%2], 1
+ punpcklbw m0, m2
+%if %6==0
+ psllw m0, 7
%endif
-%endmacro
-
-%macro WEIGHT_ROW 3 ; (src,dst,width)
- ;; load weights
- WEIGHT %1, (%1+(mmsize/2))
- packuswb m0, m1 ;put bytes into m0
- WEIGHT_SAVE_ROW m0, %2, %3
-%endmacro
-
-%macro WEIGHT_SAVE_COL 2 ;(dst,size)
-%if %2 == 8
- packuswb m0, m1
- movq [%1], m0
- movhps [%1+r1], m0
+ pmulhrsw m0, m3
+ paddw m0, m4
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+%if %5 == 8
+ movq [%3], xm0
+ movq [%4], xm1
%else
- packuswb m0, m0
- packuswb m1, m1
- movd [%1], m0 ; width 2 can write garbage for last 2 bytes
- movd [%1+r1], m1
+ movd [%3], xm0
+ movd [%4], xm1
%endif
-%endmacro
-
-%macro WEIGHT_COL 3 ; (src,dst,width)
-%if %3 <= 4 && mmsize == 16
- INIT_MMX
- ;; load weights
- WEIGHT %1, (%1+r3)
- WEIGHT_SAVE_COL %2, %3
- INIT_XMM
+%endif
+%else
+ movh m0, [%1]
+ movh m1, [%2]
+ punpcklbw m0, m2
+ punpcklbw m1, m2
+%if cpuflag(ssse3)
+%if %6==0
+ psllw m0, 7
+ psllw m1, 7
+%endif
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m0, m4
+ paddw m1, m4
%else
- WEIGHT %1, (%1+r3)
- WEIGHT_SAVE_COL %2, %3
+ pmullw m0, m3
+ pmullw m1, m3
+ paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
+ paddsw m1, m4
+ psraw m0, m5
+ psraw m1, m5
+%endif
+%if %5 == 8
+ packuswb m0, m1
+ movh [%3], m0
+ movhps [%4], m0
+%else
+ packuswb m0, m0
+ packuswb m1, m1
+ movd [%3], m0 ; width 2 can write garbage for the last 2 bytes
+ movd [%4], m1
+%endif
%endif
-
%endmacro
-
-%macro WEIGHT_TWO_ROW 3 ; (src,dst,width)
+; src, dst, width, fast
+%macro WEIGHT_TWO_ROW 4
%assign x 0
%rep %3
%if (%3-x) >= mmsize
- WEIGHT_ROW (%1+x), (%2+x), mmsize ; weight 1 mmsize
- WEIGHT_ROW (%1+r3+x), (%2+r1+x), mmsize ; weight 1 mmsize
+ WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
%assign x (x+mmsize)
%else
- WEIGHT_COL (%1+x),(%2+x),(%3-x)
- %exitrep
+ %assign w %3-x
+%if w == 20
+ %assign w 16
+%endif
+ WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
+ %assign x (x+w)
%endif
%if x >= %3
%exitrep
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
+;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
-%ifdef ARCH_X86_64
-%define NUMREGS 6
-%define LOAD_HEIGHT
-%define HEIGHT_REG r5d
-%define TMP_REG r6d
-%else
-%define NUMREGS 5
-%define TMP_REG r5d
-%define LOAD_HEIGHT mov r4d, r5m
-%define HEIGHT_REG r4d
-%endif
-
-%assign XMMREGS 7
-%ifdef HIGH_BIT_DEPTH
-%assign NUMREGS NUMREGS+1
-%assign XMMREGS 8
-%endif
-
%macro WEIGHTER 1
- cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS
+cglobal mc_weight_w%1, 6,6,8
FIX_STRIDES r1, r3
WEIGHT_START %1
- LOAD_HEIGHT
+%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
+ ; we can merge the shift step into the scale factor
+ ; if (m3<<7) doesn't overflow an int16_t
+ cmp byte [r4+1], 0
+ jz .fast
+%endif
.loop:
- WEIGHT_TWO_ROW r2, r0, %1
+ WEIGHT_TWO_ROW r2, r0, %1, 0
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
- sub HEIGHT_REG, 2
+ sub r5d, 2
jg .loop
- REP_RET
+ RET
+%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
+.fast:
+ psllw m3, 7
+.fastloop:
+ WEIGHT_TWO_ROW r2, r0, %1, 1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ sub r5d, 2
+ jg .fastloop
+ RET
+%endif
%endmacro
INIT_MMX mmx2
WEIGHTER 8
WEIGHTER 16
WEIGHTER 20
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
WEIGHTER 12
-INIT_XMM avx
-WEIGHTER 8
-WEIGHTER 12
-WEIGHTER 16
-WEIGHTER 20
%else
-%define WEIGHT WEIGHT_SSSE3
-%define WEIGHT_START WEIGHT_START_SSSE3
INIT_MMX ssse3
WEIGHTER 4
INIT_XMM ssse3
WEIGHTER 8
WEIGHTER 16
WEIGHTER 20
-INIT_XMM avx
-WEIGHTER 8
+INIT_YMM avx2
+WEIGHTER 8
WEIGHTER 16
WEIGHTER 20
%endif
%macro OFFSET_OP 7
mov%6 m0, [%1]
mov%6 m1, [%2]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
p%5usw m0, m2
p%5usw m1, m2
%ifidn %5,add
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
%assign x (x+mmsize)
%else
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
%else
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
%endmacro
;-----------------------------------------------------------------------------
-;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
+;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
%macro OFFSET 2
- cglobal mc_offset%2_w%1, NUMREGS, NUMREGS
+cglobal mc_offset%2_w%1, 6,6
FIX_STRIDES r1, r3
mova m2, [r4]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%ifidn %2,add
mova m3, [pw_pixel_max]
%endif
%endif
- LOAD_HEIGHT
.loop:
OFFSET_TWO_ROW r2, r0, %1, %2
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
- sub HEIGHT_REG, 2
+ sub r5d, 2
jg .loop
- REP_RET
+ RET
%endmacro
%macro OFFSETPN 1
OFFSETPN 12
OFFSETPN 16
OFFSETPN 20
-INIT_XMM avx
-OFFSETPN 12
-OFFSETPN 16
-OFFSETPN 20
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_XMM sse2
OFFSETPN 8
-INIT_XMM avx
-OFFSETPN 8
%endif
-%undef LOAD_HEIGHT
-%undef HEIGHT_REG
-%undef NUMREGS
-
;=============================================================================
;=============================================================================
;-----------------------------------------------------------------------------
-; void pixel_avg_4x4( pixel *dst, int dst_stride,
-; pixel *src1, int src1_stride, pixel *src2, int src2_stride, int weight );
+; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
+; pixel *src2, intptr_t src2_stride, int weight );
;-----------------------------------------------------------------------------
%macro AVGH 2
cglobal pixel_avg_%1x%2
mov eax, %2
cmp dword r6m, 32
jne pixel_avg_weight_w%1 %+ SUFFIX
+%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads
+ jmp pixel_avg_w%1_avx2
+%else
%if mmsize == 16 && %1 == 16
test dword r4m, 15
jz pixel_avg_w%1_sse2
%endif
jmp pixel_avg_w%1_mmx2
+%endif
%endmacro
;-----------------------------------------------------------------------------
-; void pixel_avg_w4( pixel *dst, int dst_stride,
-; pixel *src1, int src1_stride, pixel *src2, int src2_stride,
-; int height, int weight );
+; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
+; pixel *src2, intptr_t src2_stride, int height, int weight );
;-----------------------------------------------------------------------------
%macro AVG_FUNC 3
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
%2 m0, [t2+x]
%2 m1, [t2+x+SIZEOF_PIXEL*t3]
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
pavgw m0, [t4+x]
pavgw m1, [t4+x+SIZEOF_PIXEL*t5]
%else ;!HIGH_BIT_DEPTH
AVG_END
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_MMX mmx2
AVG_FUNC 4, movq, movq
+AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
INIT_XMM sse2
AVG_FUNC 4, movq, movq
+AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
INIT_MMX mmx2
AVG_FUNC 4, movd, movd
+AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
AVGH 8, 8
AVGH 8, 4
INIT_MMX ssse3
+AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
+INIT_XMM avx2
+AVG_FUNC 16, movdqu, movdqa
+AVGH 16, 16
+AVGH 16, 8
%endif ;HIGH_BIT_DEPTH
; pixel avg2
;=============================================================================
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void pixel_avg2_wN( uint16_t *dst, int dst_stride,
-; uint16_t *src1, int src_stride,
+; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride,
+; uint16_t *src1, intptr_t src_stride,
; uint16_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W_ONE 1
.height_loop:
movu m0, [r2]
movu m1, [r2+r3*2]
-%if mmsize == 8
+%if cpuflag(avx) || mmsize == 8
pavgw m0, [r2+r4]
pavgw m1, [r2+r6]
%else
%endif
mova [r0], m0
mova [r0+r1*2], m1
- sub r5d, 2
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
+ sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
%macro AVG2_W_TWO 3
%3 [r0+mmsize], m1
mova [r0+r1*2], m2
%3 [r0+r1*2+mmsize], m3
- sub r5d, 2
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
+ sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
INIT_MMX mmx2
AVG2_W_ONE 8
AVG2_W_TWO 10, movd, movd
AVG2_W_TWO 16, movu, mova
+INIT_YMM avx2
+AVG2_W_ONE 16
INIT_MMX
cglobal pixel_avg2_w10_mmx2, 6,7
mova [r0+r1*2+ 0], m3
mova [r0+r1*2+ 8], m4
movh [r0+r1*2+16], m5
- sub r5d, 2
lea r2, [r2+r3*2*2]
lea r0, [r0+r1*2*2]
+ sub r5d, 2
jg .height_loop
- REP_RET
+ RET
cglobal pixel_avg2_w16_mmx2, 6,7
sub r4, r2
mova [r0+r1*2+ 8], m5
mova [r0+r1*2+16], m6
mova [r0+r1*2+24], m7
- sub r5d, 2
lea r2, [r2+r3*2*2]
lea r0, [r0+r1*2*2]
+ sub r5d, 2
jg .height_loop
- REP_RET
+ RET
cglobal pixel_avg2_w18_mmx2, 6,7
sub r4, r2
mova [r0+16], m2
mova [r0+24], m3
movh [r0+32], m4
- sub r5d, 1
lea r2, [r2+r3*2]
lea r0, [r0+r1*2]
+ dec r5d
jg .height_loop
- REP_RET
+ RET
-INIT_XMM
-cglobal pixel_avg2_w18_sse2, 6,7,6
+%macro PIXEL_AVG_W18 0
+cglobal pixel_avg2_w18, 6,7
sub r4, r2
.height_loop:
movu m0, [r2+ 0]
+ movd xm2, [r2+32]
+%if mmsize == 32
+ pavgw m0, [r2+r4+ 0]
+ movd xm1, [r2+r4+32]
+ pavgw xm2, xm1
+%else
movu m1, [r2+16]
- movh m2, [r2+32]
movu m3, [r2+r4+ 0]
movu m4, [r2+r4+16]
- movh m5, [r2+r4+32]
+ movd m5, [r2+r4+32]
pavgw m0, m3
pavgw m1, m4
pavgw m2, m5
- mova [r0+ 0], m0
mova [r0+16], m1
- movh [r0+32], m2
- sub r5d, 1
+%endif
+ mova [r0+ 0], m0
+ movd [r0+32], xm2
lea r2, [r2+r3*2]
lea r0, [r0+r1*2]
+ dec r5d
jg .height_loop
- REP_RET
+ RET
+%endmacro
+
+INIT_XMM sse2
+PIXEL_AVG_W18
+INIT_YMM avx2
+PIXEL_AVG_W18
+
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
-; void pixel_avg2_w4( uint8_t *dst, int dst_stride,
-; uint8_t *src1, int src_stride,
+; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride,
+; uint8_t *src1, intptr_t src_stride,
; uint8_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W8 2
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
INIT_MMX
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
%endmacro
AVG2_W16 12, movd
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
+ RET
+INIT_XMM
cglobal pixel_avg2_w16_sse2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
- movdqu xmm0, [r2]
- movdqu xmm2, [r2+r3]
- movdqu xmm1, [r2+r4]
- movdqu xmm3, [r2+r6]
+ movu m0, [r2]
+ movu m2, [r2+r3]
+ movu m1, [r2+r4]
+ movu m3, [r2+r6]
lea r2, [r2+r3*2]
- pavgb xmm0, xmm1
- pavgb xmm2, xmm3
- movdqa [r0], xmm0
- movdqa [r0+r1], xmm2
+ pavgb m0, m1
+ pavgb m2, m3
+ mova [r0], m0
+ mova [r0+r1], m2
lea r0, [r0+r1*2]
- sub r5d, 2
- jg .height_loop
- REP_RET
+ sub r5d, 2
+ jg .height_loop
+ RET
-%macro AVG2_W20 1
-cglobal pixel_avg2_w20_%1, 6,7
+cglobal pixel_avg2_w20_sse2, 6,7
sub r2, r4
lea r6, [r2+r3]
.height_loop:
- movdqu xmm0, [r4]
- movdqu xmm2, [r4+r3]
-%ifidn %1, sse2_misalign
- movd mm4, [r4+16]
- movd mm5, [r4+r3+16]
- pavgb xmm0, [r4+r2]
- pavgb xmm2, [r4+r6]
-%else
- movdqu xmm1, [r4+r2]
- movdqu xmm3, [r4+r6]
- movd mm4, [r4+16]
- movd mm5, [r4+r3+16]
- pavgb xmm0, xmm1
- pavgb xmm2, xmm3
-%endif
- pavgb mm4, [r4+r2+16]
- pavgb mm5, [r4+r6+16]
+ movu m0, [r4]
+ movu m2, [r4+r3]
+ movu m1, [r4+r2]
+ movu m3, [r4+r6]
+ movd mm4, [r4+16]
+ movd mm5, [r4+r3+16]
+ pavgb m0, m1
+ pavgb m2, m3
+ pavgb mm4, [r4+r2+16]
+ pavgb mm5, [r4+r6+16]
lea r4, [r4+r3*2]
- movdqa [r0], xmm0
- movd [r0+16], mm4
- movdqa [r0+r1], xmm2
- movd [r0+r1+16], mm5
+ mova [r0], m0
+ mova [r0+r1], m2
+ movd [r0+16], mm4
+ movd [r0+r1+16], mm5
+ lea r0, [r0+r1*2]
+ sub r5d, 2
+ jg .height_loop
+ RET
+
+INIT_YMM avx2
+cglobal pixel_avg2_w20, 6,7
+ sub r2, r4
+ lea r6, [r2+r3]
+.height_loop:
+ movu m0, [r4]
+ movu m1, [r4+r3]
+ pavgb m0, [r4+r2]
+ pavgb m1, [r4+r6]
+ lea r4, [r4+r3*2]
+ mova [r0], m0
+ mova [r0+r1], m1
lea r0, [r0+r1*2]
sub r5d, 2
jg .height_loop
- REP_RET
-%endmacro
-
-AVG2_W20 sse2
-AVG2_W20 sse2_misalign
+ RET
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
add r0, r1
dec r5d
jg .height_loop
- REP_RET
+ RET
%endmacro
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
%endif
%if 0 ; or %1==8 - but the extra branch seems too expensive
ja cachesplit
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
test r4b, 1
%else
test byte r4m, 1
INIT_MMX
AVG_CACHELINE_CHECK 8, 64, mmx2
AVG_CACHELINE_CHECK 12, 64, mmx2
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
AVG_CACHELINE_CHECK 16, 64, mmx2
AVG_CACHELINE_CHECK 20, 64, mmx2
AVG_CACHELINE_CHECK 8, 32, mmx2
jg avg_w16_align%1_%2_ssse3
ret
%if %1==0
- times 13 db 0x90 ; make sure the first ones don't end up short
+ ; make sure the first ones don't end up short
+ ALIGN 16
+ times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
%endif
%endmacro
and eax, 7
jz x264_pixel_avg2_w16_sse2
%endif
- PROLOGUE 6, 7
+ PROLOGUE 6, 8
lea r6, [r4+r2]
and r4, ~0xf
and r6, 0x1f
shl r6, 4 ;jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
%ifdef PIC
- lea r11, [avg_w16_addr]
- add r6, r11
+ lea r7, [avg_w16_addr]
+ add r6, r7
%else
lea r6, [avg_w16_addr + r6]
%endif
-%ifdef UNIX64
- jmp r6
-%else
- call r6
- RET
-%endif
+ TAIL_CALL r6, 1
%assign j 0
%assign k 1
movu m1, [r2+%4*mmsize]
movu m2, [r2+r3+%3*mmsize]
movu m3, [r2+r3+%4*mmsize]
- movu m4, [r2+r3*2+%3*mmsize]
- movu m5, [r2+r3*2+%4*mmsize]
- movu m6, [r2+%2+%3*mmsize]
- movu m7, [r2+%2+%4*mmsize]
mova [r0+%3*mmsize], m0
mova [r0+%4*mmsize], m1
mova [r0+r1+%3*mmsize], m2
mova [r0+r1+%4*mmsize], m3
- mova [r0+r1*2+%3*mmsize], m4
- mova [r0+r1*2+%4*mmsize], m5
- mova [r0+%1+%3*mmsize], m6
- mova [r0+%1+%4*mmsize], m7
+ movu m0, [r2+r3*2+%3*mmsize]
+ movu m1, [r2+r3*2+%4*mmsize]
+ movu m2, [r2+%2+%3*mmsize]
+ movu m3, [r2+%2+%4*mmsize]
+ mova [r0+r1*2+%3*mmsize], m0
+ mova [r0+r1*2+%4*mmsize], m1
+ mova [r0+%1+%3*mmsize], m2
+ mova [r0+%1+%4*mmsize], m3
%endmacro
%macro COPY4 2
%endmacro
;-----------------------------------------------------------------------------
-; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
-; uint8_t *src, int i_src_stride, int i_height )
+; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride,
+; uint8_t *src, intptr_t i_src_stride, int i_height )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal mc_copy_w4_mmx, 4,6
lea r5, [r3*3]
lea r4, [r1*3]
je .end
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%define mova movd
%define movu movd
%endif
%macro MC_COPY 1
%assign %%w %1*SIZEOF_PIXEL/mmsize
%if %%w > 0
-cglobal mc_copy_w%1, 5,7,8*(%%w/2)
+cglobal mc_copy_w%1, 5,7
FIX_STRIDES r1, r3
lea r6, [r3*3]
lea r5, [r1*3]
lea r0, [r0+r1*4]
sub r4d, 4
jg .height_loop
- REP_RET
+ RET
%endif
%endmacro
INIT_MMX mmx
MC_COPY 8
MC_COPY 16
-INIT_XMM sse2
+INIT_XMM sse
MC_COPY 8
MC_COPY 16
-INIT_XMM aligned, sse2
+INIT_XMM aligned, sse
MC_COPY 16
-
-
+%if HIGH_BIT_DEPTH
+INIT_YMM avx
+MC_COPY 16
+INIT_YMM aligned, avx
+MC_COPY 16
+%endif
;=============================================================================
; prefetch
; FIXME doesn't cover all pixels in high depth and/or 4:4:4
;-----------------------------------------------------------------------------
-; void prefetch_fenc( pixel *pix_y, int stride_y,
-; pixel *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( pixel *pix_y, intptr_t stride_y,
+; pixel *pix_uv, intptr_t stride_uv, int mb_x )
;-----------------------------------------------------------------------------
-INIT_MMX
-%ifdef ARCH_X86_64
-cglobal prefetch_fenc_mmx2, 5,5
- FIX_STRIDES r1d, r3d
+
+%macro PREFETCH_FENC 1
+%if ARCH_X86_64
+cglobal prefetch_fenc_%1, 5,5
+ FIX_STRIDES r1, r3
and r4d, 3
mov eax, r4d
imul r4d, r1d
lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
prefetcht0 [r2]
prefetcht0 [r2+r3]
+%ifidn %1, 422
+ lea r2, [r2+r3*2]
+ prefetcht0 [r2]
+ prefetcht0 [r2+r3]
+%endif
RET
%else
-cglobal prefetch_fenc_mmx2, 0,3
+cglobal prefetch_fenc_%1, 0,3
mov r2, r4m
mov r1, r1m
mov r0, r0m
lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
+%ifidn %1, 422
+ lea r0, [r0+r1*2]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+%endif
ret
%endif ; ARCH_X86_64
+%endmacro
+
+INIT_MMX mmx2
+PREFETCH_FENC 420
+PREFETCH_FENC 422
;-----------------------------------------------------------------------------
-; void prefetch_ref( pixel *pix, int stride, int parity )
+; void prefetch_ref( pixel *pix, intptr_t stride, int parity )
;-----------------------------------------------------------------------------
-cglobal prefetch_ref_mmx2, 3,3
- FIX_STRIDES r1d
+INIT_MMX mmx2
+cglobal prefetch_ref, 3,3
+ FIX_STRIDES r1
dec r2d
and r2d, r1d
lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
; chroma MC
;=============================================================================
-%ifdef ARCH_X86_64
- DECLARE_REG_TMP 10,11,6
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6,7,8
%else
DECLARE_REG_TMP 0,1,2
%endif
-%macro MC_CHROMA_START 0
+%macro MC_CHROMA_START 1
+%if ARCH_X86_64
+ PROLOGUE 0,9,%1
+%else
+ PROLOGUE 0,6,%1
+%endif
movifnidn r3, r3mp
movifnidn r4d, r4m
movifnidn r5d, r5m
- movifnidn t2d, r6m
- mov t0d, t2d
+ movifnidn t0d, r6m
+ mov t2d, t0d
mov t1d, r5d
sar t0d, 3
sar t1d, 3
add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED 4
movu %1, [%4+0]
movu %2, [%4+4]
punpcklwd %1, %3
punpckhwd %2, %3
%else
- shufps %2, %1, %3, 11011101b
- shufps %1, %3, 10001000b
+ shufps %2, %1, %3, q3131
+ shufps %1, %3, q2020
%endif
%endmacro
%else ; !HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED 3
-%if mmsize == 8 || cpuflag(misalign)
+%if mmsize == 8
punpcklwd %1, %3
%else
movh %2, %3
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride,
-; uint8_t *src, int src_stride,
+; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride,
+; uint8_t *src, intptr_t src_stride,
; int dx, int dy,
; int width, int height )
;-----------------------------------------------------------------------------
%macro MC_CHROMA 0
-cglobal mc_chroma, 0,6
- MC_CHROMA_START
+cglobal mc_chroma
+ MC_CHROMA_START 0
FIX_STRIDES r4
and r5d, 7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
jz .mc1dy
%endif
and t2d, 7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
jz .mc1dx
%endif
shl r5d, 16
pxor m6, m6
punpcklbw m5, m6
%if mmsize==8
- pshufw m7, m5, 0xee
- pshufw m6, m5, 0x00
- pshufw m5, m5, 0x55
+ pshufw m7, m5, q3232
+ pshufw m6, m5, q0000
+ pshufw m5, m5, q1111
jge .width4
%else
-%ifdef WIN64
+%if WIN64
cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
%endif
- pshufd m7, m5, 0x55
+ pshufd m7, m5, q1111
punpcklwd m5, m5
- pshufd m6, m5, 0x00
- pshufd m5, m5, 0x55
+ pshufd m6, m5, q0000
+ pshufd m5, m5, q1111
jg .width8
%endif
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
add r2, r2
UNPACK_UNALIGNED m0, m1, m2, r3
%else
SWAP 3, 0
ALIGN 4
.loop2:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
UNPACK_UNALIGNED m0, m1, m2, r3+r4
pmullw m3, m6
%else ; !HIGH_BIT_DEPTH
pmullw m0, m5
paddw m0, m2
psrlw m0, 6
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movh [r0], m0
%if mmsize == 8
psrlq m0, 32
add r1, r2
dec r5d
jg .loop2
- REP_RET
+ RET
%if mmsize==8
.width4:
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
mov t0, r0
mov t1, r1
mov t2, r3
+%if WIN64
+ %define multy0 r4m
+%else
%define multy0 [rsp-8]
+%endif
mova multy0, m5
%else
mov r3m, r3
%endif
%else
.width8:
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define multy0 m8
SWAP 8, 5
%else
%endif
FIX_STRIDES r2
.loopx:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
UNPACK_UNALIGNED m0, m2, m4, r3
UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
%else
add r3, r4
ALIGN 4
.loop4:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
UNPACK_UNALIGNED m0, m1, m2, r3
pmaddwd m0, m7
pmaddwd m1, m7
paddw m1, m3
psrlw m0, 6
psrlw m1, 6
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
movh [r0], m0
movh [r0+mmsize/2], m1
%if mmsize==8
%else ; !HIGH_BIT_DEPTH
packuswb m0, m1
%if mmsize==8
- pshufw m1, m0, 0x8
- pshufw m0, m0, 0xd
+ pshufw m1, m0, q0020
+ pshufw m0, m0, q0031
movd [r0], m1
movd [r1], m0
%else
- pshufd m0, m0, 0xd8
+ pshufd m0, m0, q3120
movq [r0], m0
movhps [r1], m0
%endif
dec r5d
jg .loop4
%if mmsize!=8
- REP_RET
+ RET
%else
sub dword r7m, 4
jg .width8
- REP_RET
+ RET
.width8:
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
lea r3, [t2+8*SIZEOF_PIXEL]
lea r0, [t0+4*SIZEOF_PIXEL]
lea r1, [t1+4*SIZEOF_PIXEL]
jmp .loopx
%endif
-%ifdef ARCH_X86_64 ; too many regs for x86_32
+%if ARCH_X86_64 ; too many regs for x86_32
RESET_MM_PERMUTATION
-%ifdef WIN64
-%if xmm_regs_used > 6
- %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
- %assign xmm_regs_used 6
-%endif
+%if WIN64
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ %assign xmm_regs_used 0
%endif
.mc1dy:
and t2d, 7
movd m5, r5d
mov r6d, 2*SIZEOF_PIXEL
.mc1d:
-%ifdef HIGH_BIT_DEPTH
-%if mmsize == 16
+%if HIGH_BIT_DEPTH && mmsize == 16
WIN64_SPILL_XMM 8
-%endif
%endif
mova m4, [pw_8]
SPLATW m5, m5
movifnidn r5d, r8m
cmp dword r7m, 4
jg .mc1d_w8
- mov r10, r2
- mov r11, r4
+ mov r7, r2
+ mov r8, r4
%if mmsize!=8
shr r5d, 1
%endif
.loop1d_w4:
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if mmsize == 8
movq m0, [r3+0]
movq m2, [r3+8]
%else
movu m0, [r3]
movu m1, [r3+r6]
- add r3, r11
+ add r3, r8
movu m2, [r3]
movu m3, [r3+r6]
%endif
movq m0, [r3]
movq m1, [r3+r6]
%if mmsize!=8
- add r3, r11
+ add r3, r8
movhps m0, [r3]
movhps m1, [r3+r6]
%endif
paddw m2, m3
psrlw m0, 3
psrlw m2, 3
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
%if mmsize == 8
- xchg r4, r11
- xchg r2, r10
+ xchg r4, r8
+ xchg r2, r7
%endif
movq [r0], m0
movq [r1], m2
%if mmsize == 16
- add r0, r10
- add r1, r10
+ add r0, r7
+ add r1, r7
movhps [r0], m0
movhps [r1], m2
%endif
%else ; !HIGH_BIT_DEPTH
packuswb m0, m2
%if mmsize==8
- xchg r4, r11
- xchg r2, r10
+ xchg r4, r8
+ xchg r2, r7
movd [r0], m0
psrlq m0, 32
movd [r1], m0
movhlps m1, m0
movd [r0], m0
movd [r1], m1
- add r0, r10
- add r1, r10
+ add r0, r7
+ add r1, r7
psrldq m0, 4
psrldq m1, 4
movd [r0], m0
add r1, r2
dec r5d
jg .loop1d_w4
- REP_RET
+ RET
.mc1d_w8:
sub r2, 4*SIZEOF_PIXEL
sub r4, 8*SIZEOF_PIXEL
- mov r10, 4*SIZEOF_PIXEL
- mov r11, 8*SIZEOF_PIXEL
+ mov r7, 4*SIZEOF_PIXEL
+ mov r8, 8*SIZEOF_PIXEL
%if mmsize==8
shl r5d, 1
%endif
%endif ; ARCH_X86_64
%endmacro ; MC_CHROMA
-
%macro MC_CHROMA_SSSE3 0
-cglobal mc_chroma, 0,6,9
- MC_CHROMA_START
+cglobal mc_chroma
+ MC_CHROMA_START 10-cpuflag(avx2)
and r5d, 7
and t2d, 7
mov t0d, r5d
sub r5d, t2d
imul t2d, t0d ; (x*255+8)*y
imul r5d, t0d ; (x*255+8)*(8-y)
- movd m6, t2d
- movd m7, r5d
+ movd xm6, t2d
+ movd xm7, r5d
%if cpuflag(cache64)
mov t0d, r3d
and t0d, 7
%ifdef PIC
lea t1, [ch_shuf_adj]
- movddup m5, [t1 + t0*4]
+ movddup xm5, [t1 + t0*4]
%else
- movddup m5, [ch_shuf_adj + t0*4]
+ movddup xm5, [ch_shuf_adj + t0*4]
%endif
- paddb m5, [ch_shuf]
+ paddb xm5, [ch_shuf]
and r3, ~7
%else
mova m5, [ch_shuf]
movifnidn r1, r1mp
movifnidn r2d, r2m
movifnidn r5d, r8m
+%if cpuflag(avx2)
+ vpbroadcastw m6, xm6
+ vpbroadcastw m7, xm7
+%else
SPLATW m6, m6
SPLATW m7, m7
+%endif
+%if ARCH_X86_64
+ %define shiftround m8
+ mova m8, [pw_512]
+%else
+ %define shiftround [pw_512]
+%endif
cmp dword r7m, 4
jg .width8
+
+%if cpuflag(avx2)
+.loop4:
+ movu xm0, [r3]
+ movu xm1, [r3+r4]
+ vinserti128 m0, m0, [r3+r4], 1
+ vinserti128 m1, m1, [r3+r4*2], 1
+ pshufb m0, m5
+ pshufb m1, m5
+ pmaddubsw m0, m7
+ pmaddubsw m1, m6
+ paddw m0, m1
+ pmulhrsw m0, shiftround
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [r0], xm0
+ movd [r0+r2], xm1
+ psrldq xm0, 4
+ psrldq xm1, 4
+ movd [r1], xm0
+ movd [r1+r2], xm1
+ lea r3, [r3+r4*2]
+ lea r0, [r0+r2*2]
+ lea r1, [r1+r2*2]
+ sub r5d, 2
+ jg .loop4
+ RET
+.width8:
+ movu xm0, [r3]
+ vinserti128 m0, m0, [r3+8], 1
+ pshufb m0, m5
+.loop8:
+ movu xm3, [r3+r4]
+ vinserti128 m3, m3, [r3+r4+8], 1
+ pshufb m3, m5
+ pmaddubsw m1, m0, m7
+ pmaddubsw m2, m3, m6
+ pmaddubsw m3, m3, m7
+
+ movu xm0, [r3+r4*2]
+ vinserti128 m0, m0, [r3+r4*2+8], 1
+ pshufb m0, m5
+ pmaddubsw m4, m0, m6
+
+ paddw m1, m2
+ paddw m3, m4
+ pmulhrsw m1, shiftround
+ pmulhrsw m3, shiftround
+ packuswb m1, m3
+ mova m2, [deinterleave_shufd]
+ vpermd m1, m2, m1
+ vextracti128 xm2, m1, 1
+ movq [r0], xm1
+ movhps [r1], xm1
+ movq [r0+r2], xm2
+ movhps [r1+r2], xm2
+%else
movu m0, [r3]
pshufb m0, m5
.loop4:
pmaddubsw m2, m1, m7
pmaddubsw m1, m6
pmaddubsw m3, m6
- paddw m0, [pw_32]
- paddw m2, [pw_32]
paddw m1, m0
paddw m3, m2
+ pmulhrsw m1, shiftround
+ pmulhrsw m3, shiftround
mova m0, m4
- psrlw m1, 6
- psrlw m3, 6
packuswb m1, m3
- movhlps m3, m1
movd [r0], m1
+%if cpuflag(sse4)
+ pextrd [r1], m1, 1
+ pextrd [r0+r2], m1, 2
+ pextrd [r1+r2], m1, 3
+%else
+ movhlps m3, m1
movd [r0+r2], m3
psrldq m1, 4
psrldq m3, 4
movd [r1], m1
movd [r1+r2], m3
+%endif
lea r3, [r3+r4*2]
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
sub r5d, 2
jg .loop4
- REP_RET
-
+ RET
.width8:
movu m0, [r3]
pshufb m0, m5
movu m1, [r3+8]
pshufb m1, m5
-%ifdef ARCH_X86_64
- SWAP 8, 6
- %define mult1 m8
+%if ARCH_X86_64
+ SWAP 9, 6
+ %define mult1 m9
%else
mova r0m, m6
%define mult1 r0m
pmaddubsw m1, m7
pmaddubsw m2, mult1
pmaddubsw m3, mult1
- paddw m0, [pw_32]
- paddw m1, [pw_32]
paddw m0, m2
paddw m1, m3
- psrlw m0, 6
- psrlw m1, 6
+ pmulhrsw m0, shiftround ; x + 32 >> 6
+ pmulhrsw m1, shiftround
packuswb m0, m1
- pshufd m0, m0, 0xd8
+ pshufd m0, m0, q3120
movq [r0], m0
movhps [r1], m0
pmaddubsw m6, m7
pmaddubsw m2, mult1
pmaddubsw m3, mult1
- paddw m4, [pw_32]
- paddw m6, [pw_32]
paddw m2, m4
paddw m3, m6
- psrlw m2, 6
- psrlw m3, 6
+ pmulhrsw m2, shiftround
+ pmulhrsw m3, shiftround
packuswb m2, m3
- pshufd m2, m2, 0xd8
+ pshufd m2, m2, q3120
movq [r0+r2], m2
movhps [r1+r2], m2
+%endif
lea r3, [r3+r4*2]
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
sub r5d, 2
jg .loop8
- REP_RET
+ RET
%endmacro
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
INIT_MMX mmx2
MC_CHROMA
INIT_XMM sse2
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
MC_CHROMA
-INIT_XMM sse2, misalign
-MC_CHROMA
INIT_XMM sse2
MC_CHROMA
INIT_XMM ssse3
MC_CHROMA_SSSE3
INIT_XMM avx
MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
+INIT_YMM avx2
+MC_CHROMA_SSSE3
%endif ; HIGH_BIT_DEPTH