;*****************************************************************************
;* predict-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt
;*          Fiona Glaser
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86inc.asm"

%macro STORE8x8 2
    movq        [r0 + 0*FDEC_STRIDE], %1
    movq        [r0 + 1*FDEC_STRIDE], %1
    movq        [r0 + 2*FDEC_STRIDE], %1
    movq        [r0 + 3*FDEC_STRIDE], %1
    movq        [r0 + 4*FDEC_STRIDE], %2
    movq        [r0 + 5*FDEC_STRIDE], %2
    movq        [r0 + 6*FDEC_STRIDE], %2
    movq        [r0 + 7*FDEC_STRIDE], %2
%endmacro

%macro STORE16x16 2
    mov         r1d, 4
.loop:
    movq        [r0 + 0*FDEC_STRIDE], %1
    movq        [r0 + 1*FDEC_STRIDE], %1
    movq        [r0 + 2*FDEC_STRIDE], %1
    movq        [r0 + 3*FDEC_STRIDE], %1
    movq        [r0 + 0*FDEC_STRIDE + 8], %2
    movq        [r0 + 1*FDEC_STRIDE + 8], %2
    movq        [r0 + 2*FDEC_STRIDE + 8], %2
    movq        [r0 + 3*FDEC_STRIDE + 8], %2
    add         r0, 4*FDEC_STRIDE
    dec         r1d
    jg          .loop
%endmacro

%macro STORE16x16_SSE2 1
    mov         r1d, 4
.loop:
    movdqa      [r0 + 0*FDEC_STRIDE], %1
    movdqa      [r0 + 1*FDEC_STRIDE], %1
    movdqa      [r0 + 2*FDEC_STRIDE], %1
    movdqa      [r0 + 3*FDEC_STRIDE], %1
    add         r0, 4*FDEC_STRIDE
    dec         r1d
    jg          .loop
%endmacro

SECTION_RODATA

ALIGN 16
pb_1:       times 16 db 1
pw_2:       times 4 dw 2
pw_4:       times 4 dw 4
pw_8:       times 8 dw 8
pw_76543210:
pw_3210:    dw 0, 1, 2, 3, 4, 5, 6, 7
pb_00s_ff:  times 8 db 0
pb_0s_ff:   times 7 db 0
            db 0xff

SECTION .text

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED8x8_LOWPASS0 6
    mov%6       %5, %2
    pavgb       %2, %3
    pxor        %3, %5
    mov%6       %1, %4
    pand        %3, [pb_1 GLOBAL]
    psubusb     %2, %3
    pavgb       %1, %2
%endmacro
%macro PRED8x8_LOWPASS 5
    PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q
%endmacro
%macro PRED8x8_LOWPASS_XMM 5
    PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
%endmacro

;-----------------------------------------------------------------------------
; void predict_4x4_ddl_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_ddl_mmxext, 1,1,1
    sub         r0, FDEC_STRIDE
    movq        mm3, [r0]
    movq        mm1, [r0-1]
    movq        mm2, mm3
    movq        mm4, [pb_0s_ff GLOBAL]
    psrlq       mm2, 8
    pand        mm4, mm3
    por         mm2, mm4
    PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5

%assign Y 1
%rep 4
    psrlq       mm0, 8
    movd        [r0+Y*FDEC_STRIDE], mm0
%assign Y (Y+1)
%endrep
    RET

;-----------------------------------------------------------------------------
; void predict_4x4_vl_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_4x4_vl_mmxext, 1,1,1
    movq        mm1, [r0-FDEC_STRIDE]
    movq        mm3, mm1
    movq        mm2, mm1
    psrlq       mm3, 8
    psrlq       mm2, 16
    movq        mm4, mm3
    pavgb       mm4, mm1
    PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5

    movd        [r0+0*FDEC_STRIDE], mm4
    movd        [r0+1*FDEC_STRIDE], mm0
    psrlq       mm4, 8
    psrlq       mm0, 8
    movd        [r0+2*FDEC_STRIDE], mm4
    movd        [r0+3*FDEC_STRIDE], mm0
    RET
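
; A note on the predict_8x8_* functions below: they read from the filtered
; edge buffer built by x264_predict_8x8_filter (common/predict.c) rather than
; from src itself. Judging from the offsets used ([r1+7], [r1+15], [r1+16]),
; the layout is:
;   edge[7..14]  = left neighbours, bottom (l7) to top (l0)
;   edge[15]     = top-left corner
;   edge[16..]   = top and top-right neighbours
; PRED8x8_LOWPASS evaluates the 1-2-1 filter (l + 2*m + r + 2) >> 2 without
; widening to 16 bits:
;   avg  = pavgb(l, r)           ; (l + r + 1) >> 1
;   avg -= (l ^ r) & 1           ; remove the rounding bias -> (l + r) >> 1
;   out  = pavgb(avg, m)         ; (m + ((l + r) >> 1) + 1) >> 1
; which equals (l + 2*m + r + 2) >> 2 for all 8-bit inputs.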

;-----------------------------------------------------------------------------
; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_v_mmxext, 2,2
    movq        mm0, [r1+16]
    STORE8x8    mm0, mm0
    RET

;-----------------------------------------------------------------------------
; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
cglobal predict_8x8_dc_mmxext, 2,2,1
    pxor        mm0, mm0
    pxor        mm1, mm1
    psadbw      mm0, [r1+7]
    psadbw      mm1, [r1+16]
    paddw       mm0, [pw_8 GLOBAL]
    paddw       mm0, mm1
    psrlw       mm0, 4
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0
    STORE8x8    mm0, mm0
    RET

;-----------------------------------------------------------------------------
; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
%macro PRED8x8_DC 2
cglobal %1, 2,2,1
    pxor        mm0, mm0
    psadbw      mm0, [r1+%2]
    paddw       mm0, [pw_4 GLOBAL]
    psrlw       mm0, 3
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0
    STORE8x8    mm0, mm0
    RET
%endmacro
PRED8x8_DC predict_8x8_dc_top_mmxext, 16
PRED8x8_DC predict_8x8_dc_left_mmxext, 7

%ifndef ARCH_X86_64
; sse2 is faster even on amd, so there's no sense in spending exe size on these
; functions if we know sse2 is available.

;-----------------------------------------------------------------------------
; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl_mmxext, 2,2,1
    movq        mm5, [r1+16]
    movq        mm2, [r1+17]
    movq        mm3, [r1+23]
    movq        mm4, [r1+25]
    movq        mm1, mm5
    psllq       mm1, 8
    PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
    PRED8x8_LOWPASS mm1, mm3, mm4, [r1+24], mm6

%assign Y 7
%rep 6
    movq        [r0+Y*FDEC_STRIDE], mm1
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 56
    psllq       mm0, 8
    por         mm1, mm2
%assign Y (Y-1)
%endrep
    movq        [r0+Y*FDEC_STRIDE], mm1
    psllq       mm1, 8
    psrlq       mm0, 56
    por         mm1, mm0
%assign Y (Y-1)
    movq        [r0+Y*FDEC_STRIDE], mm1
    RET

;-----------------------------------------------------------------------------
; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr_mmxext, 2,2,1
    movq        mm1, [r1+7]
    movq        mm2, [r1+9]
    movq        mm3, [r1+15]
    movq        mm4, [r1+17]
    PRED8x8_LOWPASS mm0, mm1, mm2, [r1+8], mm7
    PRED8x8_LOWPASS mm1, mm3, mm4, [r1+16], mm6

%assign Y 7
%rep 6
    movq        [r0+Y*FDEC_STRIDE], mm0
    movq        mm2, mm1
    psrlq       mm0, 8
    psllq       mm2, 56
    psrlq       mm1, 8
    por         mm0, mm2
%assign Y (Y-1)
%endrep
    movq        [r0+Y*FDEC_STRIDE], mm0
    psrlq       mm0, 8
    psllq       mm1, 56
    por         mm0, mm1
%assign Y (Y-1)
    movq        [r0+Y*FDEC_STRIDE], mm0
    RET
%endif ; !ARCH_X86_64

;-----------------------------------------------------------------------------
; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddl_sse2, 2,2,1
    movdqa      xmm3, [r1+16]
    movdqu      xmm2, [r1+17]
    movdqa      xmm1, xmm3
    pslldq      xmm1, 1
    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4

%assign Y 0
%rep 8
    psrldq      xmm0, 1
    movq        [r0+Y*FDEC_STRIDE], xmm0
%assign Y (Y+1)
%endrep
    RET

;-----------------------------------------------------------------------------
; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_ddr_sse2, 2,2,1
    movdqu      xmm3, [r1+8]
    movdqu      xmm1, [r1+7]
    movdqa      xmm2, xmm3
    psrldq      xmm2, 1
    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4

    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
%assign Y 7
%rep 3
    movq        [r0+Y*FDEC_STRIDE], xmm0
    movq        [r0+(Y-1)*FDEC_STRIDE], xmm1
    psrldq      xmm0, 2
    psrldq      xmm1, 2
%assign Y (Y-2)
%endrep
    movq        [r0+1*FDEC_STRIDE], xmm0
    movq        [r0+0*FDEC_STRIDE], xmm1
    RET
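
; In the diagonal predictors above, each output row is the previous row
; shifted by one pixel, so the whole 8x8 block comes from a single filtered
; 16-pixel vector. The mmx versions have to carry bytes between two 64-bit
; registers (psllq/psrlq + por), while the sse2 versions just shift one xmm
; register per row with psrldq/pslldq; that is why the mmx ddl/ddr variants
; are only assembled when sse2 cannot be assumed (see the %ifndef above).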

;-----------------------------------------------------------------------------
; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
cglobal predict_8x8_vl_sse2, 2,2,1
    movdqa      xmm4, [r1+16]
    movdqa      xmm2, xmm4
    movdqa      xmm1, xmm4
    movdqa      xmm3, xmm4
    psrldq      xmm2, 1
    pslldq      xmm1, 1
    pavgb       xmm3, xmm2
    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
; xmm3: (t0 + t1 + 1) >> 1

%assign Y 0
%rep 3
    psrldq      xmm0, 1
    movq        [r0+ Y   *FDEC_STRIDE], xmm3
    movq        [r0+(Y+1)*FDEC_STRIDE], xmm0
    psrldq      xmm3, 1
%assign Y (Y+2)
%endrep
    psrldq      xmm0, 1
    movq        [r0+ Y   *FDEC_STRIDE], xmm3
    movq        [r0+(Y+1)*FDEC_STRIDE], xmm0
    RET

;-----------------------------------------------------------------------------
; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------

; fills only some pixels:
; f01234567
; 0........
; 1,,,,,,,,
; 2 .......
; 3 ,,,,,,,
; 4  ......
; 5  ,,,,,,
; 6   .....
; 7   ,,,,,

cglobal predict_8x8_vr_core_mmxext, 2,2,1
    movq        mm2, [r1+16]
    movq        mm3, [r1+15]
    movq        mm1, [r1+14]
    movq        mm4, mm3
    pavgb       mm3, mm2
    PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7

%assign Y 0
%rep 3
    movq        [r0+ Y   *FDEC_STRIDE], mm3
    movq        [r0+(Y+1)*FDEC_STRIDE], mm0
    psllq       mm3, 8
    psllq       mm0, 8
%assign Y (Y+2)
%endrep
    movq        [r0+ Y   *FDEC_STRIDE], mm3
    movq        [r0+(Y+1)*FDEC_STRIDE], mm0
    RET

;-----------------------------------------------------------------------------
; void predict_8x8c_v_mmx( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_v_mmx, 1,1
    movq        mm0, [r0 - FDEC_STRIDE]
    STORE8x8    mm0, mm0
    RET

;-----------------------------------------------------------------------------
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_dc_core_mmxext, 1,1,1
    movq        mm0, [r0 - FDEC_STRIDE]
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpckhbw   mm1, mm0
    punpcklbw   mm0, mm2
    psadbw      mm1, mm2        ; s1
    psadbw      mm0, mm2        ; s0

%ifdef ARCH_X86_64
    movd        mm4, r1d
    movd        mm5, r2d
    paddw       mm0, mm4
    pshufw      mm2, mm5, 0
%else
    paddw       mm0, r1m
    pshufw      mm2, r2m, 0
%endif
    psrlw       mm0, 3
    paddw       mm1, [pw_2 GLOBAL]
    movq        mm3, mm2
    pshufw      mm1, mm1, 0
    pshufw      mm0, mm0, 0     ; dc0 (w)
    paddw       mm3, mm1
    psrlw       mm3, 3          ; dc3 (w)
    psrlw       mm2, 2          ; dc2 (w)
    psrlw       mm1, 2          ; dc1 (w)

    packuswb    mm0, mm1        ; dc0,dc1 (b)
    packuswb    mm2, mm3        ; dc2,dc3 (b)

    STORE8x8    mm0, mm2
    RET

%macro LOAD_PLANE_ARGS 0
%ifdef ARCH_X86_64
    movd        mm0, r1d
    movd        mm2, r2d
    movd        mm4, r3d
    pshufw      mm0, mm0, 0
    pshufw      mm2, mm2, 0
    pshufw      mm4, mm4, 0
%else
    pshufw      mm0, r1m, 0
    pshufw      mm2, r2m, 0
    pshufw      mm4, r3m, 0
%endif
%endmacro

;-----------------------------------------------------------------------------
; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_8x8c_p_core_mmxext, 1,2,1
    LOAD_PLANE_ARGS
    movq        mm1, mm2
    pmullw      mm2, [pw_3210 GLOBAL]
    psllw       mm1, 2
    paddsw      mm0, mm2        ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
    paddsw      mm1, mm0        ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}

    mov         r1d, 8
ALIGN 4
.loop:
    movq        mm5, mm0
    movq        mm6, mm1
    psraw       mm5, 5
    psraw       mm6, 5
    packuswb    mm5, mm6
    movq        [r0], mm5

    paddsw      mm0, mm4
    paddsw      mm1, mm4
    add         r0, FDEC_STRIDE
    dec         r1d
    jg          .loop
    REP_RET
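
; predict_16x16_p_core below follows the same scheme as the 8x8c version
; above: the C caller is expected to precompute the H.264 plane parameters
; and pass i00 (the top-left term, rounding included), b (horizontal step)
; and c (vertical step), so that pel(x,y) = clip(( i00 + b*x + c*y ) >> 5).
; Each core seeds one row as {i00 + 0*b, ..., i00 + N*b} in 16-bit words,
; adds the broadcast c (mm4/xmm2) once per row, shifts right by 5, and lets
; packuswb perform the clamp to 0..255.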

;-----------------------------------------------------------------------------
; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_16x16_p_core_mmxext, 1,2,1
    LOAD_PLANE_ARGS
    movq        mm5, mm2
    movq        mm1, mm2
    pmullw      mm5, [pw_3210 GLOBAL]
    psllw       mm2, 3
    psllw       mm1, 2
    movq        mm3, mm2
    paddsw      mm0, mm5        ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
    paddsw      mm1, mm0        ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw      mm2, mm0        ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
    paddsw      mm3, mm1        ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}

    mov         r1d, 16
ALIGN 4
.loop:
    movq        mm5, mm0
    movq        mm6, mm1
    psraw       mm5, 5
    psraw       mm6, 5
    packuswb    mm5, mm6
    movq        [r0], mm5

    movq        mm5, mm2
    movq        mm6, mm3
    psraw       mm5, 5
    psraw       mm6, 5
    packuswb    mm5, mm6
    movq        [r0+8], mm5

    paddsw      mm0, mm4
    paddsw      mm1, mm4
    paddsw      mm2, mm4
    paddsw      mm3, mm4
    add         r0, FDEC_STRIDE
    dec         r1d
    jg          .loop
    REP_RET

;-----------------------------------------------------------------------------
; void predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
cglobal predict_16x16_p_core_sse2, 1,2,1
    movd        xmm0, r1m
    movd        xmm1, r2m
    movd        xmm2, r3m
    pshuflw     xmm0, xmm0, 0
    pshuflw     xmm1, xmm1, 0
    pshuflw     xmm2, xmm2, 0
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    movdqa      xmm3, xmm1
    pmullw      xmm3, [pw_76543210 GLOBAL]
    psllw       xmm1, 3
    paddsw      xmm0, xmm3  ; xmm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b, i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw      xmm1, xmm0  ; xmm1 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b, i+12*b, i+13*b, i+14*b, i+15*b}

    mov         r1d, 16
ALIGN 4
.loop:
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm1
    psraw       xmm3, 5
    psraw       xmm4, 5
    packuswb    xmm3, xmm4
    movdqa      [r0], xmm3
    paddsw      xmm0, xmm2
    paddsw      xmm1, xmm2
    add         r0, FDEC_STRIDE
    dec         r1d
    jg          .loop
    REP_RET

;-----------------------------------------------------------------------------
; void predict_16x16_v_mmx( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_16x16_v_mmx, 1,2
    movq        mm0, [r0 - FDEC_STRIDE]
    movq        mm1, [r0 - FDEC_STRIDE + 8]
    STORE16x16  mm0, mm1
    REP_RET

;-----------------------------------------------------------------------------
; void predict_16x16_v_sse2( uint8_t *src )
;-----------------------------------------------------------------------------
cglobal predict_16x16_v_sse2, 1,2
    movdqa      xmm0, [r0 - FDEC_STRIDE]
    STORE16x16_SSE2 xmm0
    REP_RET

;-----------------------------------------------------------------------------
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 2
    pxor        mm0, mm0
    pxor        mm1, mm1
    psadbw      mm0, [r0 - FDEC_STRIDE]
    psadbw      mm1, [r0 - FDEC_STRIDE + 8]
    paddusw     mm0, mm1
    paddusw     mm0, %1
    psrlw       mm0, %2         ; dc
    pshufw      mm0, mm0, 0
    packuswb    mm0, mm0        ; dc in bytes
    STORE16x16  mm0, mm0
%endmacro

cglobal predict_16x16_dc_core_mmxext, 1,2
%ifdef ARCH_X86_64
    movd        mm2, r1d
    PRED16x16_DC mm2, 5
%else
    PRED16x16_DC r1m, 5
%endif
    REP_RET

cglobal predict_16x16_dc_top_mmxext, 1,2,1
    PRED16x16_DC [pw_8 GLOBAL], 4
    REP_RET

;-----------------------------------------------------------------------------
; void predict_16x16_dc_core_sse2( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
%macro PRED16x16_DC_SSE2 2
    pxor        xmm0, xmm0
    psadbw      xmm0, [r0 - FDEC_STRIDE]
    movhlps     xmm1, xmm0
    paddw       xmm0, xmm1
    paddusw     xmm0, %1
    psrlw       xmm0, %2        ; dc
    pshuflw     xmm0, xmm0, 0
    punpcklqdq  xmm0, xmm0
    packuswb    xmm0, xmm0      ; dc in bytes
    STORE16x16_SSE2 xmm0
%endmacro
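
; The PRED16x16_DC* macros use psadbw against zero as a horizontal byte sum
; of the 16 top neighbours (the sse2 variant folds its two 64-bit lanes with
; movhlps + paddw). For the _core versions the shift is 5, so i_dc_left is
; expected to already hold the sum of the 16 left neighbours plus the
; rounding term 16, giving (top + left + 16) >> 5; the _top versions only
; have the top row, so they add pw_8 and shift by 4 instead.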

cglobal predict_16x16_dc_core_sse2, 1,2
    movd        xmm2, r1m
    PRED16x16_DC_SSE2 xmm2, 5
    REP_RET

cglobal predict_16x16_dc_top_sse2, 1,2,1
    PRED16x16_DC_SSE2 [pw_8 GLOBAL], 4
    REP_RET
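
; Note: STORE16x16_SSE2, predict_16x16_v_sse2 and the sse2 DC cores use
; aligned accesses (movdqa, psadbw with a memory operand), so they require
; src to be 16-byte aligned and FDEC_STRIDE to be a multiple of 16; x264's
; fdec buffer is expected to satisfy both.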