%endif
%macro AVG_END 0
- sub eax, 2
lea t4, [t4+t5*2*SIZEOF_PIXEL]
lea t2, [t2+t3*2*SIZEOF_PIXEL]
lea t0, [t0+t1*2*SIZEOF_PIXEL]
+ sub eax, 2
jg .height_loop
REP_RET
%endmacro
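; (keeping the flag-setting sub immediately before jg, here and in the loops
;  below, presumably lets the pair fuse into a single macro-op on CPUs that
;  support it)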
%endif
%macro WEIGHTER 1
- cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS*(mmsize/16)
+ cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS
FIX_STRIDES r1, r3
WEIGHT_START %1
LOAD_HEIGHT
INIT_MMX mmx2
AVG_FUNC 4, movq, movq
+AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
INIT_XMM sse2
AVG_FUNC 4, movq, movq
+AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
INIT_MMX mmx2
AVG_FUNC 4, movd, movd
+AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
AVGH 8, 8
AVGH 8, 4
INIT_MMX ssse3
+AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
; uint16_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W_ONE 1
-cglobal pixel_avg2_w%1, 6,7,4*(mmsize/16)
+cglobal pixel_avg2_w%1, 6,7,4
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
%endif
mova [r0], m0
mova [r0+r1*2], m1
- sub r5d, 2
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
+ sub r5d, 2
jg .height_loop
REP_RET
%endmacro
%macro AVG2_W_TWO 3
-cglobal pixel_avg2_w%1, 6,7,8*(mmsize/16)
+cglobal pixel_avg2_w%1, 6,7,8
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
%3 [r0+mmsize], m1
mova [r0+r1*2], m2
%3 [r0+r1*2+mmsize], m3
- sub r5d, 2
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
+ sub r5d, 2
jg .height_loop
REP_RET
%endmacro
mova [r0+r1*2+ 0], m3
mova [r0+r1*2+ 8], m4
movh [r0+r1*2+16], m5
- sub r5d, 2
lea r2, [r2+r3*2*2]
lea r0, [r0+r1*2*2]
+ sub r5d, 2
jg .height_loop
REP_RET
mova [r0+r1*2+ 8], m5
mova [r0+r1*2+16], m6
mova [r0+r1*2+24], m7
- sub r5d, 2
lea r2, [r2+r3*2*2]
lea r0, [r0+r1*2*2]
+ sub r5d, 2
jg .height_loop
REP_RET
mova [r0+16], m2
mova [r0+24], m3
movh [r0+32], m4
- sub r5d, 1
lea r2, [r2+r3*2]
lea r0, [r0+r1*2]
+ dec r5d
jg .height_loop
REP_RET
mova [r0+ 0], m0
mova [r0+16], m1
movh [r0+32], m2
- sub r5d, 1
lea r2, [r2+r3*2]
lea r0, [r0+r1*2]
+ dec r5d
jg .height_loop
REP_RET
%endif ; HIGH_BIT_DEPTH
; pixel copy
;=============================================================================
-%macro COPY4 2-*
+%macro COPY1 2
movu m0, [r2]
movu m1, [r2+r3]
movu m2, [r2+r3*2]
movu m3, [r2+%2]
mova [r0], m0
mova [r0+r1], m1
mova [r0+r1*2], m2
mova [r0+%1], m3
%endmacro
-%macro COPY_ONE 4
- COPY4 %1, %2
+%macro COPY2 2-4 0, 1
+ movu m0, [r2+%3*mmsize]
+ movu m1, [r2+%4*mmsize]
+ movu m2, [r2+r3+%3*mmsize]
+ movu m3, [r2+r3+%4*mmsize]
+ movu m4, [r2+r3*2+%3*mmsize]
+ movu m5, [r2+r3*2+%4*mmsize]
+ movu m6, [r2+%2+%3*mmsize]
+ movu m7, [r2+%2+%4*mmsize]
+ mova [r0+%3*mmsize], m0
+ mova [r0+%4*mmsize], m1
+ mova [r0+r1+%3*mmsize], m2
+ mova [r0+r1+%4*mmsize], m3
+ mova [r0+r1*2+%3*mmsize], m4
+ mova [r0+r1*2+%4*mmsize], m5
+ mova [r0+%1+%3*mmsize], m6
+ mova [r0+%1+%4*mmsize], m7
%endmacro
-%macro COPY_TWO 4
- movu m0, [r2+%3]
- movu m1, [r2+%4]
- movu m2, [r2+r3+%3]
- movu m3, [r2+r3+%4]
- movu m4, [r2+r3*2+%3]
- movu m5, [r2+r3*2+%4]
- movu m6, [r2+%2+%3]
- movu m7, [r2+%2+%4]
- mova [r0+%3], m0
- mova [r0+%4], m1
- mova [r0+r1+%3], m2
- mova [r0+r1+%4], m3
- mova [r0+r1*2+%3], m4
- mova [r0+r1*2+%4], m5
- mova [r0+%1+%3], m6
- mova [r0+%1+%4], m7
+%macro COPY4 2
+ COPY2 %1, %2, 0, 1
+ COPY2 %1, %2, 2, 3
%endmacro
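; COPY1/COPY2/COPY4 each copy four rows per call: COPY1 moves one mmsize-wide
; block per row, COPY2 moves two (block indices %3/%4), and COPY4 covers four
; blocks by running COPY2 twice. %1/%2 hold the precomputed 3*dst_stride and
; 3*src_stride so the fourth row needs no extra address arithmetic.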
;-----------------------------------------------------------------------------
%define mova movd
%define movu movd
%endif
- COPY4 r4, r5
+ COPY1 r4, r5
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
.end:
- COPY4 r4, r5
+ COPY1 r4, r5
RET
-%ifdef HIGH_BIT_DEPTH
-cglobal mc_copy_w16_mmx, 5,7
+%macro MC_COPY 1
+%assign %%w %1*SIZEOF_PIXEL/mmsize
+%if %%w > 0
+cglobal mc_copy_w%1, 5,7,8*(%%w/2)
FIX_STRIDES r1, r3
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
- COPY_TWO r5, r6, mmsize*0, mmsize*1
- COPY_TWO r5, r6, mmsize*2, mmsize*3
- sub r4d, 4
+ COPY %+ %%w r5, r6
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
- jg .height_loop
- REP_RET
-
-%macro MC_COPY 2
-cglobal mc_copy_w%2, 5,7,%2-8
- FIX_STRIDES r1, r3
- lea r6, [r3*3]
- lea r5, [r1*3]
-.height_loop:
- COPY_%1 r5, r6, 0, mmsize
sub r4d, 4
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
jg .height_loop
REP_RET
+%endif
%endmacro
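; %%w = width*SIZEOF_PIXEL/mmsize selects COPY1/COPY2/COPY4 above; the
; %if %%w > 0 guard skips widths narrower than one vector register
; (e.g. mc_copy_w8 under INIT_XMM at 8-bit depth).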
INIT_MMX mmx
-MC_COPY TWO, 8
-INIT_XMM sse2
-MC_COPY ONE, 8
-MC_COPY TWO, 16
-INIT_XMM aligned, sse2
-MC_COPY TWO, 16
-%endif ; HIGH_BIT_DEPTH
-
-%ifndef HIGH_BIT_DEPTH
-%macro MC_COPY 2
-cglobal mc_copy_w%2, 5,7
- lea r6, [r3*3]
- lea r5, [r1*3]
-.height_loop:
- %1 r5, r6, 0, mmsize
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
- sub r4d, 4
- jg .height_loop
- REP_RET
-%endmacro
-
-INIT_MMX mmx
-MC_COPY COPY4, 8
-MC_COPY COPY_TWO, 16
+MC_COPY 8
+MC_COPY 16
INIT_XMM sse2
-MC_COPY COPY4, 16
-; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
-; but with SSE3 the overhead is zero, so there's no reason not to include it.
-INIT_XMM sse3
-MC_COPY COPY4, 16
+MC_COPY 8
+MC_COPY 16
INIT_XMM aligned, sse2
-MC_COPY COPY4, 16
-%endif ; !HIGH_BIT_DEPTH
+MC_COPY 16
;=============================================================================
; prefetch
;=============================================================================
-; FIXME assumes 64 byte cachelines
+; assumes 64 byte cachelines
+; FIXME doesn't cover all pixels in high depth and/or 4:4:4
;-----------------------------------------------------------------------------
-; void prefetch_fenc( uint8_t *pix_y, int stride_y,
-; uint8_t *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( pixel *pix_y, int stride_y,
+; pixel *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
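; FIX_STRIDES doubles the stride arguments when HIGH_BIT_DEPTH is defined
; (strides are passed in pixels), and SIZEOF_PIXEL scales the constant byte
; offsets, so the same body serves 8-bit and 16-bit pixels.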
-INIT_MMX
+
+%macro PREFETCH_FENC 1
%ifdef ARCH_X86_64
-cglobal prefetch_fenc_mmx2, 5,5
+cglobal prefetch_fenc_%1, 5,5
+ FIX_STRIDES r1d, r3d
and r4d, 3
mov eax, r4d
imul r4d, r1d
- lea r0, [r0+r4*4+64]
+ lea r0, [r0+r4*4+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
prefetcht0 [r0+r1]
imul eax, r3d
- lea r2, [r2+rax*2+64]
+ lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
+ prefetcht0 [r2]
+ prefetcht0 [r2+r3]
+%ifidn %1, 422
+ lea r2, [r2+r3*2]
prefetcht0 [r2]
prefetcht0 [r2+r3]
+%endif
RET
%else
-cglobal prefetch_fenc_mmx2, 0,3
+cglobal prefetch_fenc_%1, 0,3
mov r2, r4m
mov r1, r1m
mov r0, r0m
+ FIX_STRIDES r1
and r2, 3
imul r2, r1
- lea r0, [r0+r2*4+64]
+ lea r0, [r0+r2*4+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
mov r2, r4m
mov r1, r3m
mov r0, r2m
+ FIX_STRIDES r1
and r2, 3
imul r2, r1
- lea r0, [r0+r2*2+64]
+ lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
+%ifidn %1, 422
+ lea r0, [r0+r1*2]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+%endif
ret
%endif ; ARCH_X86_64
+%endmacro
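; the 422 variant prefetches two extra chroma rows, since 4:2:2 chroma keeps
; full vertical resolution (twice the rows per macroblock of 4:2:0).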
+
+INIT_MMX mmx2
+PREFETCH_FENC 420
+PREFETCH_FENC 422
;-----------------------------------------------------------------------------
-; void prefetch_ref( uint8_t *pix, int stride, int parity )
+; void prefetch_ref( pixel *pix, int stride, int parity )
;-----------------------------------------------------------------------------
-cglobal prefetch_ref_mmx2, 3,3
+INIT_MMX mmx2
+cglobal prefetch_ref, 3,3
+ FIX_STRIDES r1d
dec r2d
and r2d, r1d
- lea r0, [r0+r2*8+64]
+ lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
lea r2, [r1*3]
prefetcht0 [r0]
prefetcht0 [r0+r1]
punpcklwd %1, %3
punpckhwd %2, %3
%else
- shufps %2, %1, %3, 11011101b
- shufps %1, %3, 10001000b
+ shufps %2, %1, %3, q3131
+ shufps %1, %3, q2020
%endif
%endmacro
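; q3131, q2020, q3120 etc. are named shuffle constants: qABCD expands to the
; immediate (A<<6)|(B<<4)|(C<<2)|D, each digit picking a source element
; (e.g. q2020 == 10001000b, q3131 == 11011101b).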
%else ; !HIGH_BIT_DEPTH
pxor m6, m6
punpcklbw m5, m6
%if mmsize==8
- pshufw m7, m5, 0xee
- pshufw m6, m5, 0x00
- pshufw m5, m5, 0x55
+ pshufw m7, m5, q3232
+ pshufw m6, m5, q0000
+ pshufw m5, m5, q1111
jge .width4
%else
%ifdef WIN64
cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
%endif
- pshufd m7, m5, 0x55
+ pshufd m7, m5, q1111
punpcklwd m5, m5
- pshufd m6, m5, 0x00
- pshufd m5, m5, 0x55
+ pshufd m6, m5, q0000
+ pshufd m5, m5, q1111
jg .width8
%endif
%ifdef HIGH_BIT_DEPTH
%else ; !HIGH_BIT_DEPTH
packuswb m0, m1
%if mmsize==8
- pshufw m1, m0, 0x8
- pshufw m0, m0, 0xd
+ pshufw m1, m0, q0020
+ pshufw m0, m0, q0031
movd [r0], m1
movd [r1], m0
%else
- pshufd m0, m0, 0xd8
+ pshufd m0, m0, q3120
movq [r0], m0
movhps [r1], m0
%endif
psrlw m0, 6
psrlw m1, 6
packuswb m0, m1
- pshufd m0, m0, 0xd8
+ pshufd m0, m0, q3120
movq [r0], m0
movhps [r1], m0
psrlw m2, 6
psrlw m3, 6
packuswb m2, m3
- pshufd m2, m2, 0xd8
+ pshufd m2, m2, q3120
movq [r0+r2], m2
movhps [r1+r2], m2
lea r3, [r3+r4*2]