; go to next line
add r0, r1
add r2, r3
- dec r4 ; next row
+ dec r4d ; next row
jg .nextrow
REP_RET
; go to next line
add r0, r1
add r2, r3
- dec r4 ; next row
+ dec r4d ; next row
jg .nextrow
REP_RET
; go to next line
add r0, r1
add r2, r3
- dec r4 ; next row
+ dec r4d ; next row
jg .nextrow
REP_RET
; go to next line
add r0, r1
add r2, r3
- dec r4 ; next row
+ dec r4d ; next row
jg .nextrow
REP_RET
%endmacro
; go to next line
add r0, r1
add r2, r3
- dec r4 ; next row
+ dec r4d ; next row
jg .nextrow
REP_RET
; go to next line
add r0, r1
add r2, r3
- dec r4 ; next row
+ dec r4d ; next row
jg .nextrow
REP_RET
; go to next line
add r0, r1
add r2, r3
- dec r4 ; next row
+ dec r4d ; next row
jg .nextrow
REP_RET
; go to next line
add r0, r1
add r2, r3
- dec r4 ; next row
+ dec r4d ; next row
jg .nextrow
REP_RET
; go to next line
add r0, r1
add r2, r3
- dec r4 ; next row
+ dec r4d ; next row
jg .nextrow
REP_RET
; go to next line
add r0, r1
add r2, r3
- dec r4 ; next row
+ dec r4d ; next row
jg .nextrow
REP_RET
%endmacro
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
- sub r4, 2
+ sub r4d, 2
jg .nextrow
REP_RET
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
- sub r4, 2
+ sub r4d, 2
jg .nextrow
REP_RET
%endmacro
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
- sub r4, 2
+ sub r4d, 2
jg .nextrow
REP_RET
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
- sub r4, 2
+ sub r4d, 2
jg .nextrow
REP_RET
%endmacro
SWAP %1, %4, %3
%endmacro
; NOTE(review): this region is a unified-diff hunk ('-' = removed line,
; '+' = added line), kept verbatim; comments describe the intent of the change.
; Change: fold the single MMX-only vp8_luma_dc_wht into a template macro so an
; SSE variant (which clears the coefficient block with one xmm register) can
; share the same body.
-INIT_MMX
-cglobal vp8_luma_dc_wht_mmx, 2,3
+%macro VP8_DC_WHT 1
+cglobal vp8_luma_dc_wht_%1, 2,3
; load the 4x4 block of DC coefficients (4 rows of 4 words) from r1
movq m0, [r1]
movq m1, [r1+8]
movq m2, [r1+16]
movq m3, [r1+24]
; clear the 32-byte source block after loading: two 16-byte SSE stores for
; the sse variant, four 8-byte MMX stores otherwise
+%ifidn %1, sse
+ xorps xmm0, xmm0
+ movaps [r1+ 0], xmm0
+ movaps [r1+16], xmm0
+%else
+ pxor m4, m4
+ movq [r1+ 0], m4
+ movq [r1+ 8], m4
+ movq [r1+16], m4
+ movq [r1+24], m4
+%endif
; inverse WHT: 1-D Hadamard over the rows, transpose, add rounding bias,
; then scatter results to the destination (the second transform pass is
; presumably inside SCATTER_WHT -- TODO confirm against its definition
; elsewhere in the file)
HADAMARD4_1D 0, 1, 2, 3
TRANSPOSE4x4W 0, 1, 2, 3, 4
paddw m0, [pw_3]
SCATTER_WHT 0, 1, 0
SCATTER_WHT 2, 3, 2
RET
+%endmacro
+
+INIT_MMX
+VP8_DC_WHT mmx
+VP8_DC_WHT sse
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
psrldq m%2, 4
%if %10 == 8
movd [%5+%8*2], m%1
- movd %5, m%3
+ movd %5d, m%3
%endif
psrldq m%3, 4
psrldq m%4, 4
; WRITE_2x4W: scatter two mm registers (%1, %2) as eight word-sized (2-pixel)
; stores down a column of the destination, walking by stride.
; 1/2 are mm registers holding the packed pixel words
; 3 is a GPR used as scratch for extracting 16-bit chunks
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
%macro WRITE_2x4W 6
; patch intent: name the 32-bit form of the scratch GPR explicitly (%3d) --
; movd to a GPR is a 32-bit operation, presumably keeping the template
; expansion consistent between x86-32 and x86-64 (TODO confirm against the
; x86inc register-naming conventions)
- movd %3, %1
+ movd %3d, %1
punpckhdq %1, %1
mov [%4+%5*4], %3w
; shift down to reach the high word of the extracted dword
shr %3, 16
add %4, %6
mov [%4+%5*4], %3w
- movd %3, %1
+ movd %3d, %1
add %4, %5
mov [%4+%5*2], %3w
shr %3, 16
mov [%4+%5 ], %3w
; second source register: same extract/store pattern for the remaining rows
- movd %3, %2
+ movd %3d, %2
punpckhdq %2, %2
mov [%4 ], %3w
shr %3, 16
mov [%4+%6 ], %3w
- movd %3, %2
+ movd %3d, %2
add %4, %6
mov [%4+%6 ], %3w
shr %3, 16
%endmacro
; WRITE_8W_SSE2: scatter one xmm register (%1) as eight word-sized stores down
; a column of the destination.
; 1 is an xmm register holding the packed pixel words
; 2 is a GPR used as scratch for extracting 16-bit chunks
; 3 is the destination pointer
; 4/5 are presumably -stride and +stride (mirrors WRITE_2x4W's convention --
; TODO confirm at the call sites)
%macro WRITE_8W_SSE2 5
; patch intent: use the explicit 32-bit GPR name (%2d) for movd -- movd to a
; GPR is a 32-bit operation (see note on WRITE_2x4W's identical change)
- movd %2, %1
+ movd %2d, %1
; shift the next dword of the xmm register into the low lane
psrldq %1, 4
mov [%3+%4*4], %2w
shr %2, 16
add %3, %5
mov [%3+%4*4], %2w
- movd %2, %1
+ movd %2d, %1
psrldq %1, 4
add %3, %4
mov [%3+%4*2], %2w
shr %2, 16
mov [%3+%4 ], %2w
- movd %2, %1
+ movd %2d, %1
psrldq %1, 4
mov [%3 ], %2w
shr %2, 16
mov [%3+%5 ], %2w
- movd %2, %1
+ movd %2d, %1
add %3, %5
mov [%3+%5 ], %2w
shr %2, 16
%endmacro
; SPLATB_REG_MMX: broadcast the low byte of GPR %2 into all 8 bytes of mm
; register %1 (plain-MMX path: three unpack steps double the byte each time).
; Optional third argument is unused here (kept for signature compatibility
; with the SSSE3 variant, which takes a shuffle-mask register).
%macro SPLATB_REG_MMX 2-3
; patch intent: movd from the explicit 32-bit GPR name (%2d); movd from a GPR
; is a 32-bit operation
- movd %1, %2
+ movd %1, %2d
punpcklbw %1, %1
punpcklwd %1, %1
punpckldq %1, %1
%endmacro
; SPLATB_REG_MMXEXT: broadcast the low byte of GPR %2 into all 8 bytes of mm
; register %1, using pshufw to replace two of the MMX unpack steps.
%macro SPLATB_REG_MMXEXT 2-3
; patch intent: movd from the explicit 32-bit GPR name (%2d); movd from a GPR
; is a 32-bit operation
- movd %1, %2
+ movd %1, %2d
punpcklbw %1, %1
pshufw %1, %1, 0x0
%endmacro
; SPLATB_REG_SSE2: broadcast the low byte of GPR %2 into all 16 bytes of xmm
; register %1 (unpack to words, splat the low qword, then duplicate it).
%macro SPLATB_REG_SSE2 2-3
; patch intent: movd from the explicit 32-bit GPR name (%2d); movd from a GPR
; is a 32-bit operation
- movd %1, %2
+ movd %1, %2d
punpcklbw %1, %1
pshuflw %1, %1, 0x0
punpcklqdq %1, %1
%endmacro
; SPLATB_REG_SSSE3: broadcast the low byte of GPR %2 into all bytes of %1 in
; a single pshufb, using %3 as the (presumably all-zero) shuffle mask --
; TODO confirm the mask contents at the call sites.
%macro SPLATB_REG_SSSE3 3
; patch intent: movd from the explicit 32-bit GPR name (%2d); movd from a GPR
; is a 32-bit operation
- movd %1, %2
+ movd %1, %2d
pshufb %1, %3
%endmacro
-%macro SIMPLE_LOOPFILTER 3
-cglobal vp8_%2_loop_filter_simple_%1, 3, %3
+%macro SIMPLE_LOOPFILTER 4
+cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4
%if mmsize == 8 ; mmx/mmxext
mov r3, 2
%endif
; Instantiate the simple loop filter for each SIMD level. SPLATB_REG is
; redefined before each group so the macro body picks up the right
; byte-broadcast implementation.
INIT_MMX
%define SPLATB_REG SPLATB_REG_MMX
; patch intent: SIMPLE_LOOPFILTER gains a 4th argument -- the number of xmm
; registers the function uses -- forwarded to cglobal (presumably for Win64
; callee-saved xmm save/restore; TODO confirm against the cglobal definition).
; MMX/MMXEXT variants touch no xmm registers, so they pass 0.
-SIMPLE_LOOPFILTER mmx, v, 4
-SIMPLE_LOOPFILTER mmx, h, 5
+SIMPLE_LOOPFILTER mmx, v, 4, 0
+SIMPLE_LOOPFILTER mmx, h, 5, 0
%define SPLATB_REG SPLATB_REG_MMXEXT
-SIMPLE_LOOPFILTER mmxext, v, 4
-SIMPLE_LOOPFILTER mmxext, h, 5
+SIMPLE_LOOPFILTER mmxext, v, 4, 0
+SIMPLE_LOOPFILTER mmxext, h, 5, 0
; xmm-based variants declare 8 xmm registers in use
INIT_XMM
%define SPLATB_REG SPLATB_REG_SSE2
%define WRITE_8W WRITE_8W_SSE2
-SIMPLE_LOOPFILTER sse2, v, 3
-SIMPLE_LOOPFILTER sse2, h, 5
+SIMPLE_LOOPFILTER sse2, v, 3, 8
+SIMPLE_LOOPFILTER sse2, h, 5, 8
%define SPLATB_REG SPLATB_REG_SSSE3
-SIMPLE_LOOPFILTER ssse3, v, 3
-SIMPLE_LOOPFILTER ssse3, h, 5
+SIMPLE_LOOPFILTER ssse3, v, 3, 8
+SIMPLE_LOOPFILTER ssse3, h, 5, 8
; sse4 only changes the horizontal store path (WRITE_8W), so only the h
; variant is re-instantiated
%define WRITE_8W WRITE_8W_SSE4
-SIMPLE_LOOPFILTER sse4, h, 5
+SIMPLE_LOOPFILTER sse4, h, 5, 8
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,