;*****************************************************************************
;* mc-a.asm: x86 motion compensation
;*****************************************************************************
-;* Copyright (C) 2003-2010 x264 project
+;* Copyright (C) 2003-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
%endif
%macro AVG_END 0
- sub eax, 2
lea t4, [t4+t5*2*SIZEOF_PIXEL]
lea t2, [t2+t3*2*SIZEOF_PIXEL]
lea t0, [t0+t1*2*SIZEOF_PIXEL]
+ sub eax, 2
jg .height_loop
REP_RET
%endmacro
%endmacro
%ifdef HIGH_BIT_DEPTH
-
%macro BIWEIGHT_ROW 4
BIWEIGHT [%2], [%3]
%if %4==mmsize/4
;-----------------------------------------------------------------------------
; void pixel_avg_weight_w16( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight )
;-----------------------------------------------------------------------------
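; Weighted bi-prediction: each output pixel is (src1*i_weight + src2*(64-i_weight) + 32) >> 6.
; AVGH only branches here when the weight argument is not 32 (the plain-average case).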
-%macro AVG_WEIGHT 2-3 0
-cglobal pixel_avg_weight_w%2_%1
+%macro AVG_WEIGHT 1-2 0
+cglobal pixel_avg_weight_w%1
BIWEIGHT_START
- AVG_START %3
+ AVG_START %2
%ifdef HIGH_BIT_DEPTH
mova m7, [pw_pixel_max]
%endif
.height_loop:
-%if mmsize==16 && %2==mmsize/(2*SIZEOF_PIXEL)
+%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
BIWEIGHT [t2], [t4]
SWAP 0, 6
BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
movhps [t0+SIZEOF_PIXEL*t1], m6
%else
%assign x 0
-%rep (%2*SIZEOF_PIXEL+mmsize-1)/mmsize
- BIWEIGHT_ROW t0+x, t2+x, t4+x, %2
- BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %2
+%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
+ BIWEIGHT_ROW t0+x, t2+x, t4+x, %1
+ BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1
%assign x x+mmsize
%endrep
%endif
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
-INIT_MMX
-AVG_WEIGHT mmxext, 4
-AVG_WEIGHT mmxext, 8
-AVG_WEIGHT mmxext, 16
+INIT_MMX mmx2
+AVG_WEIGHT 4
+AVG_WEIGHT 8
+AVG_WEIGHT 16
%ifdef HIGH_BIT_DEPTH
-INIT_XMM
-AVG_WEIGHT sse2, 4, 8
-AVG_WEIGHT sse2, 8, 8
-AVG_WEIGHT sse2, 16, 8
+INIT_XMM sse2
+AVG_WEIGHT 4, 8
+AVG_WEIGHT 8, 8
+AVG_WEIGHT 16, 8
%else ;!HIGH_BIT_DEPTH
-INIT_XMM
-AVG_WEIGHT sse2, 8, 7
-AVG_WEIGHT sse2, 16, 7
+INIT_XMM sse2
+AVG_WEIGHT 8, 7
+AVG_WEIGHT 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
-INIT_MMX
-AVG_WEIGHT ssse3, 4
-INIT_XMM
-AVG_WEIGHT ssse3, 8, 7
-AVG_WEIGHT ssse3, 16, 7
+INIT_MMX ssse3
+AVG_WEIGHT 4
+INIT_XMM ssse3
+AVG_WEIGHT 8, 7
+AVG_WEIGHT 16, 7
%endif ;HIGH_BIT_DEPTH
;=============================================================================
movhps [%2+r1+x], m5
%else
WEIGHT %1+x, %1+x+mmsize/2
- SWAP m5, m7
+ SWAP 5, 7
WEIGHT %1+r3+x, %1+r3+x+mmsize/2
CLIPW m5, [pb_0], m4
CLIPW m7, [pb_0], m4
%assign XMMREGS 8
%endif
-%macro WEIGHTER 2
- cglobal mc_weight_w%1_%2, NUMREGS, NUMREGS, XMMREGS*(mmsize/16)
+%macro WEIGHTER 1
+ cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS
FIX_STRIDES r1, r3
WEIGHT_START %1
LOAD_HEIGHT
REP_RET
%endmacro
-INIT_MMX
-WEIGHTER 4, mmxext
-WEIGHTER 8, mmxext
-WEIGHTER 12, mmxext
-WEIGHTER 16, mmxext
-WEIGHTER 20, mmxext
-INIT_XMM
-WEIGHTER 8, sse2
-WEIGHTER 16, sse2
-WEIGHTER 20, sse2
+INIT_MMX mmx2
+WEIGHTER 4
+WEIGHTER 8
+WEIGHTER 12
+WEIGHTER 16
+WEIGHTER 20
+INIT_XMM sse2
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
%ifdef HIGH_BIT_DEPTH
-WEIGHTER 12, sse2
+WEIGHTER 12
+INIT_XMM avx
+WEIGHTER 8
+WEIGHTER 12
+WEIGHTER 16
+WEIGHTER 20
%else
%define WEIGHT WEIGHT_SSSE3
%define WEIGHT_START WEIGHT_START_SSSE3
-INIT_MMX
-WEIGHTER 4, ssse3
-INIT_XMM
-WEIGHTER 8, ssse3
-WEIGHTER 16, ssse3
-WEIGHTER 20, ssse3
+INIT_MMX ssse3
+WEIGHTER 4
+INIT_XMM ssse3
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
+INIT_XMM avx
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
%endif
%macro OFFSET_OP 7
;-----------------------------------------------------------------------------
; void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
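; mc_offsetadd / mc_offsetsub apply the constant offset stored in *w to every pixel,
; saturating at the valid pixel range (high bit depth additionally clamps adds to pixel_max).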
-%macro OFFSET 3
- cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS
+%macro OFFSET 2
+ cglobal mc_offset%2_w%1, NUMREGS, NUMREGS
FIX_STRIDES r1, r3
mova m2, [r4]
%ifdef HIGH_BIT_DEPTH
-%ifidn %3,add
+%ifidn %2,add
mova m3, [pw_pixel_max]
%endif
%endif
LOAD_HEIGHT
.loop:
- OFFSET_TWO_ROW r2, r0, %1, %3
+ OFFSET_TWO_ROW r2, r0, %1, %2
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
sub HEIGHT_REG, 2
REP_RET
%endmacro
-%macro OFFSETPN 2
- OFFSET %1, %2, add
- OFFSET %1, %2, sub
+%macro OFFSETPN 1
+ OFFSET %1, add
+ OFFSET %1, sub
%endmacro
-INIT_MMX
-OFFSETPN 4, mmxext
-OFFSETPN 8, mmxext
-OFFSETPN 12, mmxext
-OFFSETPN 16, mmxext
-OFFSETPN 20, mmxext
-INIT_XMM
-OFFSETPN 12, sse2
-OFFSETPN 16, sse2
-OFFSETPN 20, sse2
+INIT_MMX mmx2
+OFFSETPN 4
+OFFSETPN 8
+OFFSETPN 12
+OFFSETPN 16
+OFFSETPN 20
+INIT_XMM sse2
+OFFSETPN 12
+OFFSETPN 16
+OFFSETPN 20
+INIT_XMM avx
+OFFSETPN 12
+OFFSETPN 16
+OFFSETPN 20
%ifdef HIGH_BIT_DEPTH
-OFFSETPN 8, sse2
+INIT_XMM sse2
+OFFSETPN 8
+INIT_XMM avx
+OFFSETPN 8
%endif
%undef LOAD_HEIGHT
%undef HEIGHT_REG
; void pixel_avg_4x4( pixel *dst, int dst_stride,
; pixel *src1, int src1_stride, pixel *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
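; AVGH dispatches on the weight argument: weight 32 is a plain average, anything else
; tail-calls the corresponding pixel_avg_weight_wN function.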
-%macro AVGH 3
-cglobal pixel_avg_%1x%2_%3
+%macro AVGH 2
+cglobal pixel_avg_%1x%2
mov eax, %2
cmp dword r6m, 32
- jne pixel_avg_weight_w%1_%3
+ jne pixel_avg_weight_w%1 %+ SUFFIX
%if mmsize == 16 && %1 == 16
test dword r4m, 15
jz pixel_avg_w%1_sse2
%endif
- jmp pixel_avg_w%1_mmxext
+ jmp pixel_avg_w%1_mmx2
%endmacro
;-----------------------------------------------------------------------------
; int height, int weight );
;-----------------------------------------------------------------------------
-%macro AVG_FUNC 4
-cglobal pixel_avg_w%1_%4
+%macro AVG_FUNC 3
+cglobal pixel_avg_w%1
AVG_START
.height_loop:
%assign x 0
%ifdef HIGH_BIT_DEPTH
-INIT_MMX
-AVG_FUNC 4, movq, movq, mmxext
-AVGH 4, 8, mmxext
-AVGH 4, 4, mmxext
-AVGH 4, 2, mmxext
-
-AVG_FUNC 8, movq, movq, mmxext
-AVGH 8, 16, mmxext
-AVGH 8, 8, mmxext
-AVGH 8, 4, mmxext
-
-AVG_FUNC 16, movq, movq, mmxext
-AVGH 16, 16, mmxext
-AVGH 16, 8, mmxext
-
-INIT_XMM
-
-AVG_FUNC 4, movq, movq, sse2
-AVGH 4, 8, sse2
-AVGH 4, 4, sse2
-AVGH 4, 2, sse2
-
-AVG_FUNC 8, movdqu, movdqa, sse2
-AVGH 8, 16, sse2
-AVGH 8, 8, sse2
-AVGH 8, 4, sse2
-
-AVG_FUNC 16, movdqu, movdqa, sse2
-AVGH 16, 16, sse2
-AVGH 16, 8, sse2
+INIT_MMX mmx2
+AVG_FUNC 4, movq, movq
+AVGH 4, 16
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
+
+AVG_FUNC 8, movq, movq
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+
+AVG_FUNC 16, movq, movq
+AVGH 16, 16
+AVGH 16, 8
+
+INIT_XMM sse2
+AVG_FUNC 4, movq, movq
+AVGH 4, 16
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
+
+AVG_FUNC 8, movdqu, movdqa
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+
+AVG_FUNC 16, movdqu, movdqa
+AVGH 16, 16
+AVGH 16, 8
%else ;!HIGH_BIT_DEPTH
-INIT_MMX
-AVG_FUNC 4, movd, movd, mmxext
-AVGH 4, 8, mmxext
-AVGH 4, 4, mmxext
-AVGH 4, 2, mmxext
-
-AVG_FUNC 8, movq, movq, mmxext
-AVGH 8, 16, mmxext
-AVGH 8, 8, mmxext
-AVGH 8, 4, mmxext
-
-AVG_FUNC 16, movq, movq, mmxext
-AVGH 16, 16, mmxext
-AVGH 16, 8, mmxext
-
-INIT_XMM
-AVG_FUNC 16, movdqu, movdqa, sse2
-AVGH 16, 16, sse2
-AVGH 16, 8, sse2
-AVGH 8, 16, sse2
-AVGH 8, 8, sse2
-AVGH 8, 4, sse2
-AVGH 16, 16, ssse3
-AVGH 16, 8, ssse3
-AVGH 8, 16, ssse3
-AVGH 8, 8, ssse3
-AVGH 8, 4, ssse3
-INIT_MMX
-AVGH 4, 8, ssse3
-AVGH 4, 4, ssse3
-AVGH 4, 2, ssse3
+INIT_MMX mmx2
+AVG_FUNC 4, movd, movd
+AVGH 4, 16
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
+
+AVG_FUNC 8, movq, movq
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+
+AVG_FUNC 16, movq, movq
+AVGH 16, 16
+AVGH 16, 8
+
+INIT_XMM sse2
+AVG_FUNC 16, movdqu, movdqa
+AVGH 16, 16
+AVGH 16, 8
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+INIT_XMM ssse3
+AVGH 16, 16
+AVGH 16, 8
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+INIT_MMX ssse3
+AVGH 4, 16
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
%endif ;HIGH_BIT_DEPTH
+
;=============================================================================
; pixel avg2
;=============================================================================
; uint16_t *src1, int src_stride,
; uint16_t *src2, int height );
;-----------------------------------------------------------------------------
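; pixel_avg2 averages the two sources into dst, (a+b+1)>>1 per pixel, as used when
; blending two halfpel planes for quarter-pel positions.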
-%macro AVG2_W_ONE 2
-cglobal pixel_avg2_w%1_%2, 6,7,4*(mmsize/16)
+%macro AVG2_W_ONE 1
+cglobal pixel_avg2_w%1, 6,7,4
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
%endif
mova [r0], m0
mova [r0+r1*2], m1
- sub r5d, 2
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
+ sub r5d, 2
jg .height_loop
REP_RET
%endmacro
-%macro AVG2_W_TWO 4
-cglobal pixel_avg2_w%1_%4, 6,7,8*(mmsize/16)
+%macro AVG2_W_TWO 3
+cglobal pixel_avg2_w%1, 6,7,8
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
%3 [r0+mmsize], m1
mova [r0+r1*2], m2
%3 [r0+r1*2+mmsize], m3
- sub r5d, 2
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
+ sub r5d, 2
jg .height_loop
REP_RET
%endmacro
-INIT_MMX
-AVG2_W_ONE 4, mmxext
-AVG2_W_TWO 8, movu, mova, mmxext
-INIT_XMM
-AVG2_W_ONE 8, sse2
-AVG2_W_TWO 10, movd, movd, sse2
-AVG2_W_TWO 16, movu, mova, sse2
+INIT_MMX mmx2
+AVG2_W_ONE 4
+AVG2_W_TWO 8, movu, mova
+INIT_XMM sse2
+AVG2_W_ONE 8
+AVG2_W_TWO 10, movd, movd
+AVG2_W_TWO 16, movu, mova
INIT_MMX
-cglobal pixel_avg2_w10_mmxext, 6,7
+cglobal pixel_avg2_w10_mmx2, 6,7
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
mova [r0+r1*2+ 0], m3
mova [r0+r1*2+ 8], m4
movh [r0+r1*2+16], m5
- sub r5d, 2
lea r2, [r2+r3*2*2]
lea r0, [r0+r1*2*2]
+ sub r5d, 2
jg .height_loop
REP_RET
-cglobal pixel_avg2_w16_mmxext, 6,7
+cglobal pixel_avg2_w16_mmx2, 6,7
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
mova [r0+r1*2+ 8], m5
mova [r0+r1*2+16], m6
mova [r0+r1*2+24], m7
- sub r5d, 2
lea r2, [r2+r3*2*2]
lea r0, [r0+r1*2*2]
+ sub r5d, 2
jg .height_loop
REP_RET
-cglobal pixel_avg2_w18_mmxext, 6,7
+cglobal pixel_avg2_w18_mmx2, 6,7
sub r4, r2
.height_loop:
movu m0, [r2+ 0]
mova [r0+16], m2
mova [r0+24], m3
movh [r0+32], m4
- sub r5d, 1
lea r2, [r2+r3*2]
lea r0, [r0+r1*2]
+ dec r5d
jg .height_loop
REP_RET
mova [r0+ 0], m0
mova [r0+16], m1
movh [r0+32], m2
- sub r5d, 1
lea r2, [r2+r3*2]
lea r0, [r0+r1*2]
+ dec r5d
jg .height_loop
REP_RET
%endif ; HIGH_BIT_DEPTH
; uint8_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W8 2
-cglobal pixel_avg2_w%1_mmxext, 6,7
+cglobal pixel_avg2_w%1_mmx2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
REP_RET
%endmacro
+INIT_MMX
AVG2_W8 4, movd
AVG2_W8 8, movq
%macro AVG2_W16 2
-cglobal pixel_avg2_w%1_mmxext, 6,7
+cglobal pixel_avg2_w%1_mmx2, 6,7
sub r2, r4
lea r6, [r2+r3]
.height_loop:
AVG2_W16 12, movd
AVG2_W16 16, movq
-cglobal pixel_avg2_w20_mmxext, 6,7
+cglobal pixel_avg2_w20_mmx2, 6,7
sub r2, r4
lea r6, [r2+r3]
.height_loop:
%endmacro
%macro AVG_CACHELINE_FUNC 2
-pixel_avg2_w%1_cache_mmxext:
+pixel_avg2_w%1_cache_mmx2:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
%if %1>8
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
%if %1 == 12
;w12 isn't needed because w16 is just as fast if there's no cacheline split
-%define cachesplit pixel_avg2_w16_cache_mmxext
+%define cachesplit pixel_avg2_w16_cache_mmx2
%else
-%define cachesplit pixel_avg2_w%1_cache_mmxext
+%define cachesplit pixel_avg2_w%1_cache_mmx2
%endif
cglobal pixel_avg2_w%1_cache%2_%3
mov eax, r2m
- and eax, 0x1f|(%2>>1)
- cmp eax, (32-%1-(%1 % 8))|(%2>>1)
+ and eax, %2-1
+ cmp eax, (%2-%1-(%1 % 8))
%if %1==12||%1==20
jbe pixel_avg2_w%1_%3
%else
jz pixel_avg2_w%1_%3
mov eax, r2m
%endif
-%ifidn %3, sse2
- AVG_CACHELINE_FUNC %1, %2
-%elif %1==8 && %2==64
+%if mmsize==16 || (%1==8 && %2==64)
AVG_CACHELINE_FUNC %1, %2
%else
jmp cachesplit
%endif
%endmacro
-AVG_CACHELINE_CHECK 8, 64, mmxext
-AVG_CACHELINE_CHECK 12, 64, mmxext
+INIT_MMX
+AVG_CACHELINE_CHECK 8, 64, mmx2
+AVG_CACHELINE_CHECK 12, 64, mmx2
%ifndef ARCH_X86_64
-AVG_CACHELINE_CHECK 16, 64, mmxext
-AVG_CACHELINE_CHECK 20, 64, mmxext
-AVG_CACHELINE_CHECK 8, 32, mmxext
-AVG_CACHELINE_CHECK 12, 32, mmxext
-AVG_CACHELINE_CHECK 16, 32, mmxext
-AVG_CACHELINE_CHECK 20, 32, mmxext
+AVG_CACHELINE_CHECK 16, 64, mmx2
+AVG_CACHELINE_CHECK 20, 64, mmx2
+AVG_CACHELINE_CHECK 8, 32, mmx2
+AVG_CACHELINE_CHECK 12, 32, mmx2
+AVG_CACHELINE_CHECK 16, 32, mmx2
+AVG_CACHELINE_CHECK 20, 32, mmx2
%endif
+INIT_XMM
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2
; pixel copy
;=============================================================================
-%macro COPY4 4
- %2 m0, [r2]
- %2 m1, [r2+r3]
- %2 m2, [r2+r3*2]
- %2 m3, [r2+%4]
- %1 [r0], m0
- %1 [r0+r1], m1
- %1 [r0+r1*2], m2
- %1 [r0+%3], m3
+%macro COPY1 2
+ movu m0, [r2]
+ movu m1, [r2+r3]
+ movu m2, [r2+r3*2]
+ movu m3, [r2+%2]
+ mova [r0], m0
+ mova [r0+r1], m1
+ mova [r0+r1*2], m2
+ mova [r0+%1], m3
%endmacro
-%ifdef HIGH_BIT_DEPTH
-%macro COPY_ONE 6
- COPY4 %1, %2, %3, %4
+%macro COPY2 2-4 0, 1
+ movu m0, [r2+%3*mmsize]
+ movu m1, [r2+%4*mmsize]
+ movu m2, [r2+r3+%3*mmsize]
+ movu m3, [r2+r3+%4*mmsize]
+ movu m4, [r2+r3*2+%3*mmsize]
+ movu m5, [r2+r3*2+%4*mmsize]
+ movu m6, [r2+%2+%3*mmsize]
+ movu m7, [r2+%2+%4*mmsize]
+ mova [r0+%3*mmsize], m0
+ mova [r0+%4*mmsize], m1
+ mova [r0+r1+%3*mmsize], m2
+ mova [r0+r1+%4*mmsize], m3
+ mova [r0+r1*2+%3*mmsize], m4
+ mova [r0+r1*2+%4*mmsize], m5
+ mova [r0+%1+%3*mmsize], m6
+ mova [r0+%1+%4*mmsize], m7
%endmacro
-%macro COPY_TWO 6
- %2 m0, [r2+%5]
- %2 m1, [r2+%6]
- %2 m2, [r2+r3+%5]
- %2 m3, [r2+r3+%6]
- %2 m4, [r2+r3*2+%5]
- %2 m5, [r2+r3*2+%6]
- %2 m6, [r2+%4+%5]
- %2 m7, [r2+%4+%6]
- %1 [r0+%5], m0
- %1 [r0+%6], m1
- %1 [r0+r1+%5], m2
- %1 [r0+r1+%6], m3
- %1 [r0+r1*2+%5], m4
- %1 [r0+r1*2+%6], m5
- %1 [r0+%3+%5], m6
- %1 [r0+%3+%6], m7
+%macro COPY4 2
+ COPY2 %1, %2, 0, 1
+ COPY2 %1, %2, 2, 3
%endmacro
+;-----------------------------------------------------------------------------
+; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
+; uint8_t *src, int i_src_stride, int i_height )
+;-----------------------------------------------------------------------------
INIT_MMX
cglobal mc_copy_w4_mmx, 4,6
FIX_STRIDES r1, r3
lea r5, [r3*3]
lea r4, [r1*3]
je .end
- COPY4 mova, mova, r4, r5
+%ifndef HIGH_BIT_DEPTH
+ %define mova movd
+ %define movu movd
+%endif
+ COPY1 r4, r5
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
-.end
- COPY4 movu, mova, r4, r5
+.end:
+ COPY1 r4, r5
RET
-cglobal mc_copy_w16_mmx, 5,7
+%macro MC_COPY 1
+%assign %%w %1*SIZEOF_PIXEL/mmsize
+%if %%w > 0
+cglobal mc_copy_w%1, 5,7,8*(%%w/2)
FIX_STRIDES r1, r3
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
- COPY_TWO mova, movu, r5, r6, mmsize*0, mmsize*1
- COPY_TWO mova, movu, r5, r6, mmsize*2, mmsize*3
- sub r4d, 4
+ COPY %+ %%w r5, r6
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
- jg .height_loop
- REP_RET
-
-%macro MC_COPY 5
-cglobal mc_copy_w%2_%4, 5,7,%5
- FIX_STRIDES r1, r3
- lea r6, [r3*3]
- lea r5, [r1*3]
-.height_loop:
- COPY_%1 mova, %3, r5, r6, 0, mmsize
sub r4d, 4
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
jg .height_loop
REP_RET
+%endif
%endmacro
-MC_COPY TWO, 8, movu, mmx, 0
-INIT_XMM
-MC_COPY ONE, 8, movu, sse2, 0
-MC_COPY TWO, 16, movu, sse2, 8
-MC_COPY TWO, 16, mova, aligned_sse2, 8
-%endif ; HIGH_BIT_DEPTH
-
-%ifndef HIGH_BIT_DEPTH
-INIT_MMX
-;-----------------------------------------------------------------------------
-; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
-; uint8_t *src, int i_src_stride, int i_height )
-;-----------------------------------------------------------------------------
-cglobal mc_copy_w4_mmx, 4,6
- cmp dword r4m, 4
- lea r5, [r3*3]
- lea r4, [r1*3]
- je .end
- COPY4 movd, movd, r4, r5
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
-.end:
- COPY4 movd, movd, r4, r5
- RET
-
-cglobal mc_copy_w8_mmx, 5,7
- lea r6, [r3*3]
- lea r5, [r1*3]
-.height_loop:
- COPY4 movq, movq, r5, r6
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
- sub r4d, 4
- jg .height_loop
- REP_RET
-
-cglobal mc_copy_w16_mmx, 5,7
- lea r6, [r3*3]
- lea r5, [r1*3]
-.height_loop:
- movq mm0, [r2]
- movq mm1, [r2+8]
- movq mm2, [r2+r3]
- movq mm3, [r2+r3+8]
- movq mm4, [r2+r3*2]
- movq mm5, [r2+r3*2+8]
- movq mm6, [r2+r6]
- movq mm7, [r2+r6+8]
- movq [r0], mm0
- movq [r0+8], mm1
- movq [r0+r1], mm2
- movq [r0+r1+8], mm3
- movq [r0+r1*2], mm4
- movq [r0+r1*2+8], mm5
- movq [r0+r5], mm6
- movq [r0+r5+8], mm7
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
- sub r4d, 4
- jg .height_loop
- REP_RET
-
-INIT_XMM
-%macro COPY_W16_SSE2 2
-cglobal %1, 5,7
- lea r6, [r3*3]
- lea r5, [r1*3]
-.height_loop:
- COPY4 movdqa, %2, r5, r6
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
- sub r4d, 4
- jg .height_loop
- REP_RET
-%endmacro
-
-COPY_W16_SSE2 mc_copy_w16_sse2, movdqu
-; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
-; but with SSE3 the overhead is zero, so there's no reason not to include it.
-COPY_W16_SSE2 mc_copy_w16_sse3, lddqu
-COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa
-%endif ; !HIGH_BIT_DEPTH
+INIT_MMX mmx
+MC_COPY 8
+MC_COPY 16
+INIT_XMM sse2
+MC_COPY 8
+MC_COPY 16
+INIT_XMM aligned, sse2
+MC_COPY 16
;=============================================================================
; prefetch
;=============================================================================
-; FIXME assumes 64 byte cachelines
+; assumes 64 byte cachelines
+; FIXME doesn't cover all pixels in high depth and/or 4:4:4
;-----------------------------------------------------------------------------
-; void prefetch_fenc( uint8_t *pix_y, int stride_y,
-; uint8_t *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( pixel *pix_y, int stride_y,
+; pixel *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
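; One prefetcht0 is issued per row of macroblock mb_x in the luma and chroma planes,
; relying on a 64-byte line covering the row (hence the FIXME above).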
+
+%macro PREFETCH_FENC 1
%ifdef ARCH_X86_64
-cglobal prefetch_fenc_mmxext, 5,5
+cglobal prefetch_fenc_%1, 5,5
+ FIX_STRIDES r1d, r3d
and r4d, 3
mov eax, r4d
imul r4d, r1d
- lea r0, [r0+r4*4+64]
+ lea r0, [r0+r4*4+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
prefetcht0 [r0+r1]
imul eax, r3d
- lea r2, [r2+rax*2+64]
+ lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
+ prefetcht0 [r2]
+ prefetcht0 [r2+r3]
+%ifidn %1, 422
+ lea r2, [r2+r3*2]
prefetcht0 [r2]
prefetcht0 [r2+r3]
+%endif
RET
%else
-cglobal prefetch_fenc_mmxext, 0,3
+cglobal prefetch_fenc_%1, 0,3
mov r2, r4m
mov r1, r1m
mov r0, r0m
+ FIX_STRIDES r1
and r2, 3
imul r2, r1
- lea r0, [r0+r2*4+64]
+ lea r0, [r0+r2*4+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
mov r2, r4m
mov r1, r3m
mov r0, r2m
+ FIX_STRIDES r1
and r2, 3
imul r2, r1
- lea r0, [r0+r2*2+64]
+ lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
+%ifidn %1, 422
+ lea r0, [r0+r1*2]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+%endif
ret
%endif ; ARCH_X86_64
+%endmacro
+
+INIT_MMX mmx2
+PREFETCH_FENC 420
+PREFETCH_FENC 422
;-----------------------------------------------------------------------------
-; void prefetch_ref( uint8_t *pix, int stride, int parity )
+; void prefetch_ref( pixel *pix, int stride, int parity )
;-----------------------------------------------------------------------------
-cglobal prefetch_ref_mmxext, 3,3
+INIT_MMX mmx2
+cglobal prefetch_ref, 3,3
+ FIX_STRIDES r1d
dec r2d
and r2d, r1d
- lea r0, [r0+r2*8+64]
+ lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
lea r2, [r1*3]
prefetcht0 [r0]
prefetcht0 [r0+r1]
%macro UNPACK_UNALIGNED 4
movu %1, [%4+0]
movu %2, [%4+4]
- mova %3, %1
+ punpckhwd %3, %1, %2
punpcklwd %1, %2
- punpckhwd %3, %2
- mova %2, %1
%if mmsize == 8
+ mova %2, %1
punpcklwd %1, %3
punpckhwd %2, %3
%else
- shufps %1, %3, 10001000b
- shufps %2, %3, 11011101b
+ shufps %2, %1, %3, q3131
+ shufps %1, %3, q2020
%endif
%endmacro
%else ; !HIGH_BIT_DEPTH
-%macro UNPACK_UNALIGNED_MEM 3
+%macro UNPACK_UNALIGNED 3
+%if mmsize == 8 || cpuflag(misalign)
punpcklwd %1, %3
-%endmacro
-
-%macro UNPACK_UNALIGNED_LOAD 3
+%else
movh %2, %3
punpcklwd %1, %2
+%endif
%endmacro
%endif ; HIGH_BIT_DEPTH
; int dx, int dy,
; int width, int height )
;-----------------------------------------------------------------------------
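; Bilinear chroma interpolation: dx/dy are 1/8-pel offsets, and each output pixel is a
; 2x2 weighted average with weights (8-dx)*(8-dy), dx*(8-dy), (8-dx)*dy, dx*dy, rounded by (+32)>>6.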
-%macro MC_CHROMA 1
-cglobal mc_chroma_%1, 0,6
+%macro MC_CHROMA 0
+cglobal mc_chroma, 0,6
MC_CHROMA_START
FIX_STRIDES r4
and r5d, 7
%if mmsize==8
.skip_prologue:
%else
- jl mc_chroma_mmxext %+ .skip_prologue
+ jl mc_chroma_mmx2 %+ .skip_prologue
WIN64_SPILL_XMM 9
%endif
movd m5, t2d
pxor m6, m6
punpcklbw m5, m6
%if mmsize==8
- pshufw m7, m5, 0xee
- pshufw m6, m5, 0x00
- pshufw m5, m5, 0x55
+ pshufw m7, m5, q3232
+ pshufw m6, m5, q0000
+ pshufw m5, m5, q1111
jge .width4
%else
%ifdef WIN64
cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
%endif
- pshufd m7, m5, 0x55
+ pshufd m7, m5, q1111
punpcklwd m5, m5
- pshufd m6, m5, 0x00
- pshufd m5, m5, 0x55
+ pshufd m6, m5, q0000
+ pshufd m5, m5, q1111
jg .width8
%endif
%ifdef HIGH_BIT_DEPTH
pmaddwd m0, m7
pmaddwd m1, m7
packssdw m0, m1
- SWAP m3, m0
+ SWAP 3, 0
ALIGN 4
.loop2:
%ifdef HIGH_BIT_DEPTH
.width8:
%ifdef ARCH_X86_64
%define multy0 m8
- SWAP m8, m5
+ SWAP 8, 5
%else
%define multy0 r0m
mova multy0, m5
movu m1, [r3+mmsize/2]
UNPACK_UNALIGNED m0, m2, [r3+2]
UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
- mova m2, m0
- mova m3, m1
+ psrlw m2, m0, 8
+ psrlw m3, m1, 8
pand m0, [pw_00ff]
pand m1, [pw_00ff]
- psrlw m2, 8
- psrlw m3, 8
%endif
pmaddwd m0, m7
pmaddwd m2, m7
pmaddwd m3, m7
packssdw m0, m2
packssdw m1, m3
- SWAP m4, m0
- SWAP m5, m1
+ SWAP 4, 0
+ SWAP 5, 1
add r3, r4
ALIGN 4
.loop4:
movu m1, [r3+mmsize/2]
UNPACK_UNALIGNED m0, m2, [r3+2]
UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
- mova m2, m0
- mova m3, m1
+ psrlw m2, m0, 8
+ psrlw m3, m1, 8
pand m0, [pw_00ff]
pand m1, [pw_00ff]
- psrlw m2, 8
- psrlw m3, 8
pmaddwd m0, m7
pmaddwd m2, m7
pmaddwd m1, m7
pmullw m4, m6
pmullw m5, m6
mova m2, [pw_32]
- mova m3, m2
+ paddw m3, m2, m5
paddw m2, m4
- paddw m3, m5
mova m4, m0
mova m5, m1
pmullw m0, multy0
%else ; !HIGH_BIT_DEPTH
packuswb m0, m1
%if mmsize==8
- pshufw m1, m0, 0x8
- pshufw m0, m0, 0xd
+ pshufw m1, m0, q0020
+ pshufw m0, m0, q0031
movd [r0], m1
movd [r1], m0
%else
- pshufd m0, m0, 0xd8
+ pshufd m0, m0, q3120
movq [r0], m0
movhps [r1], m0
%endif
movhps m0, [r3]
movhps m1, [r3+r6]
%endif
- mova m2, m0
- mova m3, m1
+ psrlw m2, m0, 8
+ psrlw m3, m1, 8
pand m0, [pw_00ff]
pand m1, [pw_00ff]
- psrlw m2, 8
- psrlw m3, 8
%endif ; HIGH_BIT_DEPTH
pmullw m0, m4
pmullw m1, m5
%endmacro ; MC_CHROMA
-%macro MC_CHROMA_SSSE3 0-1
-INIT_XMM
-cglobal mc_chroma_ssse3%1, 0,6,9
+%macro MC_CHROMA_SSSE3 0
+cglobal mc_chroma, 0,6,9
MC_CHROMA_START
and r5d, 7
and t2d, 7
imul r5d, t0d ; (x*255+8)*(8-y)
movd m6, t2d
movd m7, r5d
-%ifidn %1, _cache64
+%if cpuflag(cache64)
mov t0d, r3d
and t0d, 7
%ifdef PIC
pshufb m1, m5
movu m3, [r3+r4*2]
pshufb m3, m5
- mova m2, m1
mova m4, m3
pmaddubsw m0, m7
+ pmaddubsw m2, m1, m7
pmaddubsw m1, m6
- pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, [pw_32]
paddw m2, [pw_32]
movu m1, [r3+8]
pshufb m1, m5
%ifdef ARCH_X86_64
- SWAP m8, m6
+ SWAP 8, 6
%define mult1 m8
%else
mova r0m, m6
psrlw m0, 6
psrlw m1, 6
packuswb m0, m1
- pshufd m0, m0, 0xd8
+ pshufd m0, m0, q3120
movq [r0], m0
movhps [r1], m0
psrlw m2, 6
psrlw m3, 6
packuswb m2, m3
- pshufd m2, m2, 0xd8
+ pshufd m2, m2, q3120
movq [r0+r2], m2
movhps [r1+r2], m2
lea r3, [r3+r4*2]
%endmacro
%ifdef HIGH_BIT_DEPTH
-INIT_MMX
-MC_CHROMA mmxext
-INIT_XMM
-MC_CHROMA sse2
+INIT_MMX mmx2
+MC_CHROMA
+INIT_XMM sse2
+MC_CHROMA
+INIT_XMM avx
+MC_CHROMA
%else ; !HIGH_BIT_DEPTH
-INIT_MMX
-%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM
-MC_CHROMA mmxext
-INIT_XMM
-MC_CHROMA sse2_misalign
-%define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD
-MC_CHROMA sse2
+INIT_MMX mmx2
+MC_CHROMA
+INIT_XMM sse2, misalign
+MC_CHROMA
+INIT_XMM sse2
+MC_CHROMA
+INIT_XMM ssse3
+MC_CHROMA_SSSE3
+INIT_XMM ssse3, cache64
MC_CHROMA_SSSE3
-MC_CHROMA_SSSE3 _cache64
+INIT_XMM avx
+MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
%endif ; HIGH_BIT_DEPTH