X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=common%2Fx86%2Fmc-a2.asm;h=3a1ea14f2db20e5d856b0cc14958cb020d1fbe5d;hb=648147bbc16722e67173c588c662098267294d93;hp=2824f26b20db622d529afda48f8d1e65ce05b8b3;hpb=6ecfa83c34b665ca9e98814babf4bd3e09ac6706;p=x264 diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 2824f26b..3a1ea14f 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -825,11 +825,13 @@ HPEL ssse3 %endif ; !HIGH_BIT_DEPTH ;----------------------------------------------------------------------------- -; void plane_copy_core( uint8_t *dst, int i_dst, -; uint8_t *src, int i_src, int w, int h) +; void plane_copy_core( pixel *dst, int i_dst, +; pixel *src, int i_src, int w, int h) ;----------------------------------------------------------------------------- ; assumes i_dst and w are multiples of 16, and i_dst>w +INIT_MMX cglobal plane_copy_core_mmxext, 6,7 + FIX_STRIDES r1d, r3d, r4d movsxdifnidn r1, r1d movsxdifnidn r3, r3d movsxdifnidn r4, r4d @@ -840,22 +842,22 @@ cglobal plane_copy_core_mmxext, 6,7 sub r6d, 63 .loopx: prefetchnta [r2+256] - movq mm0, [r2 ] - movq mm1, [r2+ 8] - movntq [r0 ], mm0 - movntq [r0+ 8], mm1 - movq mm2, [r2+16] - movq mm3, [r2+24] - movntq [r0+16], mm2 - movntq [r0+24], mm3 - movq mm4, [r2+32] - movq mm5, [r2+40] - movntq [r0+32], mm4 - movntq [r0+40], mm5 - movq mm6, [r2+48] - movq mm7, [r2+56] - movntq [r0+48], mm6 - movntq [r0+56], mm7 + movq m0, [r2 ] + movq m1, [r2+ 8] + movntq [r0 ], m0 + movntq [r0+ 8], m1 + movq m2, [r2+16] + movq m3, [r2+24] + movntq [r0+16], m2 + movntq [r0+24], m3 + movq m4, [r2+32] + movq m5, [r2+40] + movntq [r0+32], m4 + movntq [r0+40], m5 + movq m6, [r2+48] + movq m7, [r2+56] + movntq [r0+48], m6 + movntq [r0+56], m7 add r2, 64 add r0, 64 sub r6d, 64 @@ -864,10 +866,10 @@ cglobal plane_copy_core_mmxext, 6,7 add r6d, 63 jle .end16 .loop16: - movq mm0, [r2 ] - movq mm1, [r2+8] - movntq [r0 ], mm0 - movntq [r0+8], mm1 + movq m0, [r2 ] + movq m1, [r2+8] + movntq [r0 ], m0 + movntq [r0+8], m1 add r2, 16 add r0, 16 sub r6d, 16 @@ -1445,14 +1447,64 @@ cglobal integral_init4v_ssse3, 3,5 mova [%2], m2 %endmacro +%macro FILT8xU 3 + mova m3, [r0+%3+8] + mova m2, [r0+%3] + pavgw m3, [r0+%3+r5+8] + pavgw m2, [r0+%3+r5] + movu m1, [r0+%3+10] + movu m0, [r0+%3+2] + pavgw m1, [r0+%3+r5+10] + pavgw m0, [r0+%3+r5+2] + pavgw m1, m3 + pavgw m0, m2 + mova m3, m1 + mova m2, m0 + pand m1, m7 + pand m0, m7 + psrld m3, 16 + psrld m2, 16 + packssdw m0, m1 + packssdw m2, m3 + movu [%1], m0 + mova [%2], m2 +%endmacro + +%macro FILT8xA 4 + mova m3, [r0+%4+mmsize] + mova m2, [r0+%4] + pavgw m3, [r0+%4+r5+mmsize] + pavgw m2, [r0+%4+r5] + PALIGNR %1, m3, 2, m6 + pavgw %1, m3 + PALIGNR m3, m2, 2, m6 + pavgw m3, m2 + mova m5, m3 + mova m4, %1 + pand m3, m7 + pand %1, m7 + psrld m5, 16 + psrld m4, 16 + packssdw m3, %1 + packssdw m5, m4 + mova [%2], m3 + mova [%3], m5 + mova %1, m2 +%endmacro + ;----------------------------------------------------------------------------- ; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, ; int src_stride, int dst_stride, int width, int height ) ;----------------------------------------------------------------------------- -%macro FRAME_INIT_LOWRES 1-2 0 ; FIXME -cglobal frame_init_lowres_core_%1, 6,7,%2 +%macro FRAME_INIT_LOWRES 1 +cglobal frame_init_lowres_core_%1, 6,7,(12-4*(BIT_DEPTH/9))*(mmsize/16) ; 8 for HIGH_BIT_DEPTH, 12 otherwise +%ifdef HIGH_BIT_DEPTH + shl dword r6m, 1 + FIX_STRIDES r5d + shl dword r7m, 1 +%endif %ifdef WIN64 - movsxd r5, r5d + movsxd r5, r5d %endif ; src += 2*(height-1)*stride + 2*width mov r6d, r8m @@ -1479,6 +1531,33 @@ cglobal frame_init_lowres_core_%1, 6,7,%2 shl r6d, 1 PUSH r6 %define src_gap [rsp] +%ifdef HIGH_BIT_DEPTH + pcmpeqw m7, m7 + psrld m7, 16 +.vloop: + mov r6d, r7m +%ifnidn %1,mmxext + mova m0, [r0] + mova m1, [r0+r5] + pavgw m0, m1 + pavgw m1, [r0+r5*2] +%endif +.hloop: + sub r0, mmsize*2 + sub r1, mmsize + sub r2, mmsize + sub r3, mmsize + sub r4, mmsize +%ifidn %1,mmxext + FILT8xU r1, r2, 0 + FILT8xU r3, r4, r5 +%else + FILT8xA m0, r1, r2, 0 + FILT8xA m1, r3, r4, r5 +%endif + sub r6d, mmsize + jg .hloop +%else ; !HIGH_BIT_DEPTH %if mmsize == 16 ; adjust for the odd end case mov r6d, r7m @@ -1542,6 +1621,7 @@ cglobal frame_init_lowres_core_%1, 6,7,%2 %endif sub r6d, mmsize jg .hloop +%endif ; HIGH_BIT_DEPTH .skip: mov r6, dst_gap sub r0, src_gap @@ -1563,9 +1643,9 @@ FRAME_INIT_LOWRES mmxext FRAME_INIT_LOWRES cache32_mmxext %endif INIT_XMM -FRAME_INIT_LOWRES sse2, 12 +FRAME_INIT_LOWRES sse2 %define PALIGNR PALIGNR_SSSE3 -FRAME_INIT_LOWRES ssse3, 12 +FRAME_INIT_LOWRES ssse3 ;----------------------------------------------------------------------------- ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,