%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void plane_copy_core( uint8_t *dst, int i_dst,
-; uint8_t *src, int i_src, int w, int h)
+; void plane_copy_core( pixel *dst, int i_dst,
+; pixel *src, int i_src, int w, int h)
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>w
+INIT_MMX ; select the MMX register file so the mN aliases below map to mm0..mm7
cglobal plane_copy_core_mmxext, 6,7
+ FIX_STRIDES r1d, r3d, r4d ; scale strides/width for pixel size -- macro defined elsewhere, presumably doubles them when HIGH_BIT_DEPTH; TODO confirm
movsxdifnidn r1, r1d
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
sub r6d, 63
.loopx:
prefetchnta [r2+256]
- movq mm0, [r2 ]
- movq mm1, [r2+ 8]
- movntq [r0 ], mm0
- movntq [r0+ 8], mm1
- movq mm2, [r2+16]
- movq mm3, [r2+24]
- movntq [r0+16], mm2
- movntq [r0+24], mm3
- movq mm4, [r2+32]
- movq mm5, [r2+40]
- movntq [r0+32], mm4
- movntq [r0+40], mm5
- movq mm6, [r2+48]
- movq mm7, [r2+56]
- movntq [r0+48], mm6
- movntq [r0+56], mm7
+ movq m0, [r2 ]              ; 64-byte unrolled copy: paired loads ...
+ movq m1, [r2+ 8]
+ movntq [r0 ], m0            ; ... then non-temporal stores (dst is write-only, bypass cache)
+ movntq [r0+ 8], m1
+ movq m2, [r2+16]
+ movq m3, [r2+24]
+ movntq [r0+16], m2
+ movntq [r0+24], m3
+ movq m4, [r2+32]
+ movq m5, [r2+40]
+ movntq [r0+32], m4
+ movntq [r0+40], m5
+ movq m6, [r2+48]
+ movq m7, [r2+56]
+ movntq [r0+48], m6
+ movntq [r0+56], m7          ; only change vs. removed lines: hard-coded mmN -> mN aliases
add r2, 64
add r0, 64
sub r6d, 64
add r6d, 63
jle .end16
.loop16:
+ movq m0, [r2 ]              ; 16-byte tail loop for the remainder after the 64-byte body
+ movq m1, [r2+8]
+ movntq [r0 ], m0
+ movntq [r0+8], m1
add r2, 16
add r0, 16
sub r6d, 16
mova [%2], m2
%endmacro
+%macro FILT8xU 3 ; FILT8xU dst_a, dst_b, src_offset -- HIGH_BIT_DEPTH halve, unaligned-load variant (no PALIGNR; works on mmxext)
+ mova m3, [r0+%3+8]          ; row0: high half (aligned)
+ mova m2, [r0+%3]            ; row0: low half
+ pavgw m3, [r0+%3+r5+8]      ; vertical word average with row1 (r5 = src stride in bytes)
+ pavgw m2, [r0+%3+r5]
+ movu m1, [r0+%3+10]         ; same rows shifted by one pixel (+2 bytes) -- must be movu, offset breaks alignment
+ movu m0, [r0+%3+2]
+ pavgw m1, [r0+%3+r5+10]
+ pavgw m0, [r0+%3+r5+2]
+ pavgw m1, m3                ; average with unshifted result -> 2x2 box filter
+ pavgw m0, m2
+ mova m3, m1
+ mova m2, m0
+ pand m1, m7                 ; m7 = 0x0000ffff per dword (set up by caller); keeps even-index words
+ pand m0, m7
+ psrld m3, 16                ; odd-index words moved to low position
+ psrld m2, 16
+ packssdw m0, m1             ; repack even-phase words (signed saturation -- safe if pixels stay below 0x8000; TODO confirm bit depth bound)
+ packssdw m2, m3             ; repack odd-phase words
+ movu [%1], m0               ; even-phase output
+ mova [%2], m2               ; odd-phase output
+%endmacro
+
+%macro FILT8xA 4 ; FILT8xA carry_reg, dst_a, dst_b, src_offset -- HIGH_BIT_DEPTH halve using PALIGNR instead of unaligned loads (ssse3+ path)
+ mova m3, [r0+%4+mmsize]     ; row0: next chunk
+ mova m2, [r0+%4]            ; row0: current chunk
+ pavgw m3, [r0+%4+r5+mmsize] ; vertical word average with row1 (r5 = src stride in bytes)
+ pavgw m2, [r0+%4+r5]
+ PALIGNR %1, m3, 2, m6       ; %1 carries the adjacent chunk's vertical average from the previous iteration; shift in one pixel
+ pavgw %1, m3                ; average with unshifted -> 2x2 box filter
+ PALIGNR m3, m2, 2, m6
+ pavgw m3, m2
+ mova m5, m3
+ mova m4, %1
+ pand m3, m7                 ; m7 = 0x0000ffff per dword (set up by caller); keeps even-index words
+ pand %1, m7
+ psrld m5, 16                ; odd-index words
+ psrld m4, 16
+ packssdw m3, %1             ; repack even-phase words
+ packssdw m5, m4             ; repack odd-phase words
+ mova [%2], m3               ; even-phase output
+ mova [%3], m5               ; odd-phase output
+ mova %1, m2                 ; save this chunk's vertical average as the carry for the next iteration
+%endmacro
+
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
-%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME
-cglobal frame_init_lowres_core_%1, 6,7,%2
+%macro FRAME_INIT_LOWRES 1 ; now takes only the cpu-name suffix; xmm-reg count is computed in the cglobal line below
+cglobal frame_init_lowres_core_%1, 6,7,(12-4*(BIT_DEPTH/9))*(mmsize/16) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
+%ifdef HIGH_BIT_DEPTH
+ shl dword r6m, 1            ; dst_stride: pixels -> bytes
+ FIX_STRIDES r5d             ; src_stride likewise
+ shl dword r7m, 1            ; width: pixels -> bytes
+%endif
%ifdef WIN64
- movsxd r5, r5d
+ movsxd r5, r5d              ; (re-indent only) sign-extend src_stride on WIN64
%endif
; src += 2*(height-1)*stride + 2*width
mov r6d, r8m
shl r6d, 1
PUSH r6
%define src_gap [rsp]
+%ifdef HIGH_BIT_DEPTH
+ pcmpeqw m7, m7              ; all-ones ...
+ psrld m7, 16                ; ... -> 0x0000ffff per dword: the low-word mask FILT8xU/FILT8xA expect
+.vloop:
+ mov r6d, r7m                ; r6 = width in bytes, horizontal loop counter
+%ifnidn %1,mmxext
+ mova m0, [r0]               ; non-mmxext: preload and vertically filter the first chunk ...
+ mova m1, [r0+r5]
+ pavgw m0, m1                ; ... to seed the PALIGNR carry registers (m0/m1) for FILT8xA
+ pavgw m1, [r0+r5*2]
+%endif
+.hloop:                      ; pointers walk downward: one mmsize output block per iteration, right to left
+ sub r0, mmsize*2
+ sub r1, mmsize
+ sub r2, mmsize
+ sub r3, mmsize
+ sub r4, mmsize
+%ifidn %1,mmxext
+ FILT8xU r1, r2, 0           ; mmxext has no PALIGNR -> unaligned-load variant
+ FILT8xU r3, r4, r5
+%else
+ FILT8xA m0, r1, r2, 0       ; ssse3+: carry-register variant
+ FILT8xA m1, r3, r4, r5
+%endif
+ sub r6d, mmsize
+ jg .hloop
+%else ; !HIGH_BIT_DEPTH
%if mmsize == 16
; adjust for the odd end case
mov r6d, r7m
%endif
sub r6d, mmsize
jg .hloop
+%endif ; HIGH_BIT_DEPTH
.skip:
mov r6, dst_gap
sub r0, src_gap
FRAME_INIT_LOWRES cache32_mmxext
%endif
INIT_XMM
-FRAME_INIT_LOWRES sse2, 12
+FRAME_INIT_LOWRES sse2      ; explicit xmm count dropped -- derived inside the macro now
%define PALIGNR PALIGNR_SSSE3
-FRAME_INIT_LOWRES ssse3, 12
+FRAME_INIT_LOWRES ssse3
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,