;*****************************************************************************
;* mc-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Loren Merritt
;*          Laurent Aimar
;*          Min Chen
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86inc.asm"

SECTION_RODATA

pw_4:  times 4 dw  4
pw_8:  times 4 dw  8
pw_32: times 4 dw 32
pw_64: times 4 dw 64

SECTION .text

;=============================================================================
; pixel avg
;=============================================================================

;-----------------------------------------------------------------------------
; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
;                                 uint8_t *src, int src_stride );
;-----------------------------------------------------------------------------
%macro AVGH 2
%assign function_align 8 ; the whole function fits in 8 bytes, so a larger align just wastes space
cglobal x264_pixel_avg_%1x%2_mmxext
    mov   eax, %2
    jmp   x264_pixel_avg_w%1_mmxext
%assign function_align 16
%endmacro

;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
;                                uint8_t *src, int src_stride,
;                                int height );
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
%define t0 r0
%define t1 r1
%define t2 r2
%define t3 r3
%macro AVG_START 1
    cglobal %1, 4,5
.height_loop:
%endmacro
%else
%define t0 r1
%define t1 r2
%define t2 r3
%define t3 r4
%macro AVG_START 1
    cglobal %1, 0,5
    mov   t0, r0m
    mov   t1, r1m
    mov   t2, r2m
    mov   t3, r3m
.height_loop:
%endmacro
%endif

%macro AVG_END 0
    sub    eax, 2
    lea    t2, [t2+t3*2]
    lea    t0, [t0+t1*2]
    jg     .height_loop
    REP_RET
%endmacro

AVG_START x264_pixel_avg_w4_mmxext
    movd   mm0, [t2]
    movd   mm1, [t2+t3]
    pavgb  mm0, [t0]
    pavgb  mm1, [t0+t1]
    movd   [t0], mm0
    movd   [t0+t1], mm1
AVG_END

AVGH 4, 8
AVGH 4, 4
AVGH 4, 2

AVG_START x264_pixel_avg_w8_mmxext
    movq   mm0, [t2]
    movq   mm1, [t2+t3]
    pavgb  mm0, [t0]
    pavgb  mm1, [t0+t1]
    movq   [t0], mm0
    movq   [t0+t1], mm1
AVG_END

AVGH 8, 16
AVGH 8, 8
AVGH 8, 4

AVG_START x264_pixel_avg_w16_mmxext
    movq   mm0, [t2  ]
    movq   mm1, [t2+8]
    movq   mm2, [t2+t3  ]
    movq   mm3, [t2+t3+8]
    pavgb  mm0, [t0  ]
    pavgb  mm1, [t0+8]
    pavgb  mm2, [t0+t1  ]
    pavgb  mm3, [t0+t1+8]
    movq   [t0  ], mm0
    movq   [t0+8], mm1
    movq   [t0+t1  ], mm2
    movq   [t0+t1+8], mm3
AVG_END

AVGH 16, 16
AVGH 16, 8

AVG_START x264_pixel_avg_w16_sse2
    movdqu xmm0, [t2]
    movdqu xmm1, [t2+t3]
    pavgb  xmm0, [t0]
    pavgb  xmm1, [t0+t1]
    movdqa [t0], xmm0
    movdqa [t0+t1], xmm1
AVG_END

;=============================================================================
; pixel avg2
;=============================================================================
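
; A rough C equivalent of the avg2 routines below, for reference only (the
; function and parameter names here are illustrative, not part of x264's API).
; Each output pixel is the round-to-nearest average of two reference blocks,
; which is exactly what pavgb computes per byte; both references advance by
; the same src_stride, as in the assembly.
;
;   #include <stdint.h>
;
;   static void pixel_avg2_w( uint8_t *dst,  int dst_stride,
;                             uint8_t *src1, int src_stride,
;                             uint8_t *src2, int height, int width )
;   {
;       for( int y = 0; y < height; y++ )
;       {
;           for( int x = 0; x < width; x++ )
;               dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
;           dst  += dst_stride;
;           src1 += src_stride;
;           src2 += src_stride;
;       }
;   }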
;-----------------------------------------------------------------------------
; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride,
;                                 uint8_t *src1, int src_stride,
;                                 uint8_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W8 2
cglobal x264_pixel_avg2_w%1_mmxext, 6,7
    sub    r4, r2
    lea    r6, [r4+r3]
.height_loop:
    %2     mm0, [r2]
    %2     mm1, [r2+r3]
    pavgb  mm0, [r2+r4]
    pavgb  mm1, [r2+r6]
    %2     [r0], mm0
    %2     [r0+r1], mm1
    sub    r5d, 2
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    jg     .height_loop
    REP_RET
%endmacro

AVG2_W8 4, movd
AVG2_W8 8, movq

%macro AVG2_W16 2
cglobal x264_pixel_avg2_w%1_mmxext, 6,7
    sub    r4, r2
    lea    r6, [r4+r3]
.height_loop:
    movq   mm0, [r2]
    %2     mm1, [r2+8]
    movq   mm2, [r2+r3]
    %2     mm3, [r2+r3+8]
    pavgb  mm0, [r2+r4]
    pavgb  mm1, [r2+r4+8]
    pavgb  mm2, [r2+r6]
    pavgb  mm3, [r2+r6+8]
    movq   [r0], mm0
    %2     [r0+8], mm1
    movq   [r0+r1], mm2
    %2     [r0+r1+8], mm3
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
%endmacro

AVG2_W16 12, movd
AVG2_W16 16, movq

cglobal x264_pixel_avg2_w20_mmxext, 6,7
    sub    r4, r2
    lea    r6, [r4+r3]
.height_loop:
    movq   mm0, [r2]
    movq   mm1, [r2+8]
    movd   mm2, [r2+16]
    movq   mm3, [r2+r3]
    movq   mm4, [r2+r3+8]
    movd   mm5, [r2+r3+16]
    pavgb  mm0, [r2+r4]
    pavgb  mm1, [r2+r4+8]
    pavgb  mm2, [r2+r4+16]
    pavgb  mm3, [r2+r6]
    pavgb  mm4, [r2+r6+8]
    pavgb  mm5, [r2+r6+16]
    movq   [r0], mm0
    movq   [r0+8], mm1
    movd   [r0+16], mm2
    movq   [r0+r1], mm3
    movq   [r0+r1+8], mm4
    movd   [r0+r1+16], mm5
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET

;=============================================================================
; pixel copy
;=============================================================================
%macro COPY4 3
    %1  mm0, [r2]
    %1  mm1, [r2+r3]
    %1  mm2, [r2+r3*2]
    %1  mm3, [r2+%3]
    %1  [r0],      mm0
    %1  [r0+r1],   mm1
    %1  [r0+r1*2], mm2
    %1  [r0+%2],   mm3
%endmacro

;-----------------------------------------------------------------------------
; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
;                           uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
cglobal x264_mc_copy_w4_mmx, 4,6
    cmp    dword r4m, 4
    lea    r5, [r3*3]
    lea    r4, [r1*3]
    je     .end
    COPY4 movd, r4, r5
    lea    r2, [r2+r3*4]
    lea    r0, [r0+r1*4]
.end:
    COPY4 movd, r4, r5
    RET

cglobal x264_mc_copy_w8_mmx, 5,7
    lea    r6, [r3*3]
    lea    r5, [r1*3]
.height_loop:
    COPY4 movq, r5, r6
    lea    r2, [r2+r3*4]
    lea    r0, [r0+r1*4]
    sub    r4d, 4
    jg     .height_loop
    REP_RET

cglobal x264_mc_copy_w16_mmx, 5,7
    lea    r6, [r3*3]
    lea    r5, [r1*3]
.height_loop:
    movq   mm0, [r2]
    movq   mm1, [r2+8]
    movq   mm2, [r2+r3]
    movq   mm3, [r2+r3+8]
    movq   mm4, [r2+r3*2]
    movq   mm5, [r2+r3*2+8]
    movq   mm6, [r2+r6]
    movq   mm7, [r2+r6+8]
    movq   [r0], mm0
    movq   [r0+8], mm1
    movq   [r0+r1], mm2
    movq   [r0+r1+8], mm3
    movq   [r0+r1*2], mm4
    movq   [r0+r1*2+8], mm5
    movq   [r0+r5], mm6
    movq   [r0+r5+8], mm7
    lea    r2, [r2+r3*4]
    lea    r0, [r0+r1*4]
    sub    r4d, 4
    jg     .height_loop
    REP_RET

;=============================================================================
; weighted prediction
;=============================================================================
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
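
; A plain-C sketch of the weighting applied by BIWEIGHT_4P_MMX below, for
; reference only (the helper names are illustrative, not x264's actual C
; fallback).  i_weight_dst is the weight argument of the avg_weight
; functions; with the assumptions above, the source weight is implied by
; weight_dst + weight_src == 64, the rounding term is 32 and the shift is 6:
;
;   static inline uint8_t clip_uint8( int x )
;   {
;       return x < 0 ? 0 : x > 255 ? 255 : x;
;   }
;
;   static void pixel_avg_weight( uint8_t *dst, int dst_stride,
;                                 uint8_t *src, int src_stride,
;                                 int i_weight_dst, int width, int height )
;   {
;       for( int y = 0; y < height; y++, dst += dst_stride, src += src_stride )
;           for( int x = 0; x < width; x++ )
;               dst[x] = clip_uint8( ( dst[x] * i_weight_dst
;                                    + src[x] * (64 - i_weight_dst) + 32 ) >> 6 );
;   }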
%macro BIWEIGHT_4P_MMX 2
    movd      mm0, %1
    movd      mm1, %2
    punpcklbw mm0, mm7
    punpcklbw mm1, mm7
    pmullw    mm0, mm4
    pmullw    mm1, mm5
    paddw     mm0, mm1
    paddw     mm0, mm6
    psraw     mm0, 6
    pmaxsw    mm0, mm7
    packuswb  mm0, mm0
    movd      %1,  mm0
%endmacro

%macro BIWEIGHT_START_MMX 1
%ifidn r4m, r4d
    movd    mm4, r4m
    pshufw  mm4, mm4, 0         ; weight_dst
%else
    pshufw  mm4, r4m, 0
%endif
    picgetgot r4
    movq    mm5, [pw_64 GLOBAL]
    psubw   mm5, mm4            ; weight_src
    movq    mm6, [pw_32 GLOBAL] ; rounding
    pxor    mm7, mm7
%if %1
%ifidn r5m, r5d
    %define t0 r5d
%else
    %define t0 r4d
    mov r4d, r5m
%endif
%endif
.height_loop:
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_weight_w16_mmxext, 4,5
    BIWEIGHT_START_MMX 1
    BIWEIGHT_4P_MMX  [r0   ], [r2   ]
    BIWEIGHT_4P_MMX  [r0+ 4], [r2+ 4]
    BIWEIGHT_4P_MMX  [r0+ 8], [r2+ 8]
    BIWEIGHT_4P_MMX  [r0+12], [r2+12]
    add  r0, r1
    add  r2, r3
    dec  t0
    jg   .height_loop
    REP_RET

;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_weight_w8_mmxext, 4,5
    BIWEIGHT_START_MMX 1
    BIWEIGHT_4P_MMX  [r0  ], [r2  ]
    BIWEIGHT_4P_MMX  [r0+4], [r2+4]
    add  r0, r1
    add  r2, r3
    dec  t0
    jg   .height_loop
    REP_RET

;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4
    BIWEIGHT_START_MMX 0
    BIWEIGHT_4P_MMX  [r0     ], [r2     ]
    BIWEIGHT_4P_MMX  [r0+r1  ], [r2+r3  ]
    BIWEIGHT_4P_MMX  [r0+r1*2], [r2+r3*2]
    add  r0, r1
    add  r2, r3
    BIWEIGHT_4P_MMX  [r0+r1*2], [r2+r3*2]
    RET

;=============================================================================
; prefetch
;=============================================================================
; FIXME assumes 64 byte cachelines

;-----------------------------------------------------------------------------
; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
cglobal x264_prefetch_fenc_mmxext, 5,5
    mov    eax, r4d
    and    eax, 3
    imul   eax, r1d
    lea    r0,  [r0+rax*4+64]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    lea    r0,  [r0+r1*2]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]

    and    r4d, 6
    imul   r4d, r3d
    lea    r2,  [r2+r4+64]
    prefetcht0  [r2]
    prefetcht0  [r2+r3]
    ret

%else
cglobal x264_prefetch_fenc_mmxext
    mov    r2, [esp+20]
    mov    r1, [esp+8]
    mov    r0, [esp+4]
    and    r2, 3
    imul   r2, r1
    lea    r0, [r0+r2*4+64]
    prefetcht0 [r0]
    prefetcht0 [r0+r1]
    lea    r0, [r0+r1*2]
    prefetcht0 [r0]
    prefetcht0 [r0+r1]

    mov    r2, [esp+20]
    mov    r1, [esp+16]
    mov    r0, [esp+12]
    and    r2, 6
    imul   r2, r1
    lea    r0, [r0+r2+64]
    prefetcht0 [r0]
    prefetcht0 [r0+r1]
    ret
%endif ; ARCH_X86_64

;-----------------------------------------------------------------------------
; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
cglobal x264_prefetch_ref_mmxext, 3,3
    dec    r2d
    and    r2d, r1d
    lea    r0,  [r0+r2*8+64]
    lea    r2,  [r1*3]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    prefetcht0  [r0+r1*2]
    prefetcht0  [r0+r2]
    lea    r0,  [r0+r1*4]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    prefetcht0  [r0+r1*2]
    prefetcht0  [r0+r2]
    ret

;=============================================================================
; chroma MC
;=============================================================================
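
; A plain-C sketch of the bilinear 1/8-pel chroma interpolation implemented
; by x264_mc_chroma_mmxext below (illustrative code only, not x264's actual C
; fallback).  The integer part of dx/dy selects the source block; the
; fractional part selects the four weights cA..cD.  The assembly additionally
; takes a faster 1-D path (.mc1d) when either fractional offset is zero, but
; the result is the same:
;
;   static void mc_chroma( uint8_t *dst, int i_dst_stride,
;                          uint8_t *src, int i_src_stride,
;                          int dx, int dy, int i_width, int i_height )
;   {
;       src += (dy >> 3) * i_src_stride + (dx >> 3);
;       dx &= 7;
;       dy &= 7;
;       int cA = (8-dx)*(8-dy), cB = dx*(8-dy), cC = (8-dx)*dy, cD = dx*dy;
;       for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
;           for( int x = 0; x < i_width; x++ )
;               dst[x] = ( cA*src[x]              + cB*src[x+1]
;                        + cC*src[x+i_src_stride] + cD*src[x+i_src_stride+1]
;                        + 32 ) >> 6;
;   }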
;-----------------------------------------------------------------------------
; void x264_mc_chroma_mmxext( uint8_t *dst, int i_dst_stride,
;                             uint8_t *src, int i_src_stride,
;                             int dx, int dy,
;                             int i_width, int i_height )
;-----------------------------------------------------------------------------
cglobal x264_mc_chroma_mmxext, 0,6,1
%ifdef ARCH_X86_64
    %define t0 r10d
%else
    %define t0 r1d
%endif
    movifnidn r2d, r2m
    movifnidn r3d, r3m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
    mov     eax, r5d
    mov     t0,  r4d
    sar     eax, 3
    sar     t0,  3
    imul    eax, r3d
    pxor    mm3, mm3
    add     eax, t0
    movsxdifnidn rax, eax
    add     r2,  rax            ; src += (dx>>3) + (dy>>3) * src_stride
    and     r4d, 7              ; dx &= 7
    je      .mc1d
    and     r5d, 7              ; dy &= 7
    je      .mc1d

    movd    mm0, r4d
    movd    mm1, r5d
    pshufw  mm5, mm0, 0         ; mm5 = dx
    pshufw  mm6, mm1, 0         ; mm6 = dy
    movq    mm4, [pw_8 GLOBAL]
    movq    mm0, mm4
    psubw   mm4, mm5            ; mm4 = 8-dx
    psubw   mm0, mm6            ; mm0 = 8-dy
    movq    mm7, mm5
    pmullw  mm5, mm0            ; mm5 = dx*(8-dy) =     cB
    pmullw  mm7, mm6            ; mm7 = dx*dy =         cD
    pmullw  mm6, mm4            ; mm6 = (8-dx)*dy =     cC
    pmullw  mm4, mm0            ; mm4 = (8-dx)*(8-dy) = cA
    mov     r4d, r7m
%ifdef ARCH_X86_64
    mov     r10, r0
    mov     r11, r2
%else
    mov     r0,  r0m
    mov     r1,  r1m
    mov     r5,  r2
%endif

ALIGN 4
.height_loop
    movd    mm1, [r2+r3]
    movd    mm0, [r2]
    punpcklbw mm1, mm3          ; 00 px1 | 00 px2 | 00 px3 | 00 px4
    punpcklbw mm0, mm3
    pmullw  mm1, mm6            ; 2nd line * cC
    pmullw  mm0, mm4            ; 1st line * cA
    paddw   mm0, mm1            ; mm0 <- result
    movd    mm2, [r2+1]
    movd    mm1, [r2+r3+1]
    punpcklbw mm2, mm3
    punpcklbw mm1, mm3
    paddw   mm0, [pw_32 GLOBAL]
    pmullw  mm2, mm5            ; line * cB
    pmullw  mm1, mm7            ; line * cD
    paddw   mm0, mm2
    paddw   mm0, mm1
    psrlw   mm0, 6
    packuswb mm0, mm3           ; 00 00 00 00 px1 px2 px3 px4
    movd    [r0], mm0
    add     r2, r3
    add     r0, r1              ; i_dst_stride
    dec     r4d
    jnz     .height_loop

    sub     dword r6m, 8
    jnz     .finish             ; width != 8 so assume 4
%ifdef ARCH_X86_64
    lea     r0, [r10+4]         ; dst
    lea     r2, [r11+4]         ; src
%else
    mov     r0, r0m
    lea     r2, [r5+4]
    add     r0, 4
%endif
    mov     r4d, r7m            ; i_height
    jmp     .height_loop
.finish
    REP_RET

ALIGN 4
.mc1d
    mov       eax, r4d
    or        eax, r5d
    and       eax, 7
    cmp       r4d, 0
    mov       r5d, 1
    cmove     r5,  r3           ; pel_offset = dx ? 1 : src_stride
    movd      mm6, eax
    movq      mm5, [pw_8 GLOBAL]
    pshufw    mm6, mm6, 0
    movq      mm7, [pw_4 GLOBAL]
    psubw     mm5, mm6
    cmp       dword r6m, 8
    movifnidn r0d, r0m
    movifnidn r1d, r1m
    mov       r4d, r7m
    je        .height_loop1_w8

ALIGN 4
.height_loop1_w4
    movd      mm0, [r2+r5]
    movd      mm1, [r2]
    punpcklbw mm0, mm3
    punpcklbw mm1, mm3
    pmullw    mm0, mm6
    pmullw    mm1, mm5
    paddw     mm0, mm7
    paddw     mm0, mm1
    psrlw     mm0, 3
    packuswb  mm0, mm3
    movd      [r0], mm0
    add       r2, r3
    add       r0, r1
    dec       r4d
    jnz       .height_loop1_w4
    REP_RET

ALIGN 4
.height_loop1_w8
    movq      mm0, [r2+r5]
    movq      mm1, [r2]
    movq      mm2, mm0
    movq      mm4, mm1
    punpcklbw mm0, mm3
    punpcklbw mm1, mm3
    punpckhbw mm2, mm3
    punpckhbw mm4, mm3
    pmullw    mm0, mm6
    pmullw    mm1, mm5
    pmullw    mm2, mm6
    pmullw    mm4, mm5
    paddw     mm0, mm7
    paddw     mm2, mm7
    paddw     mm0, mm1
    paddw     mm2, mm4
    psrlw     mm0, 3
    psrlw     mm2, 3
    packuswb  mm0, mm2
    movq      [r0], mm0
    add       r2, r3
    add       r0, r1
    dec       r4d
    jnz       .height_loop1_w8
    REP_RET
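
; For reference, a C sketch of the .mc1d special case above (illustrative
; only; the function name is hypothetical).  When either fractional offset is
; zero the 2-D filter degenerates into a 1-D blend of two pixels that are
; either horizontally or vertically adjacent; d is whichever of dx&7, dy&7 is
; nonzero, and pel_offset is 1 or i_src_stride, matching the cmove above:
;
;   static void mc_chroma_1d( uint8_t *dst, int i_dst_stride,
;                             uint8_t *src, int i_src_stride,
;                             int d, int pel_offset,
;                             int i_width, int i_height )
;   {
;       for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
;           for( int x = 0; x < i_width; x++ )
;               dst[x] = ( (8-d)*src[x] + d*src[x+pel_offset] + 4 ) >> 3;
;   }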