;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
-;* This file is part of FFmpeg.
+;* This file is part of Libav.
;*
-;* FFmpeg is free software; you can redistribute it and/or
+;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
-;* FFmpeg is distributed in the hope that it will be useful,
+;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
-;* License along with FFmpeg; if not, write to the Free Software
-;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86inc.asm"
%endif
paddw m0, m1 ; sum of H coefficients
-%ifidn %3, h264
- pmullw m0, [pw_5]
- paddw m0, [pw_32]
- psraw m0, 6
-%elifidn %3, rv40
- pmullw m0, [pw_5]
- psraw m0, 6
-%elifidn %3, svq3
- movd r3d, m0
- movsx r3, r3w
- test r3, r3
- lea r4, [r3+3]
- cmovs r3, r4
- sar r3, 2 ; H/4
- lea r3, [r3*5] ; 5*(H/4)
- test r3, r3
- lea r4, [r3+15]
- cmovs r3, r4
- sar r3, 4 ; (5*(H/4))/16
- movd m0, r3d
-%endif
-
lea r4, [r0+r2*8-1]
lea r3, [r0+r2*4-1]
add r4, r2
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
lea r5, [r5+r6*4]
movzx e_reg, byte [r3 ]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
movzx r10, byte [r4+r2 ]
sub r10, e_reg
%else
movzx r4, byte [e_reg+r2 ]
movzx r6, byte [r3 ]
sub r6, r4
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
lea r6, [r10+r6*2]
lea r5, [r5+r6*2]
add r5, r6
%endif
movzx r4, byte [e_reg ]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
movzx r10, byte [r3 +r2 ]
sub r10, r4
sub r5, r10
movzx r4, byte [e_reg+r1 ]
movzx r6, byte [r3 +r2*2]
sub r6, r4
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
add r6, r10
%endif
lea r5, [r5+r6*8]
lea r5, [r5+r6*4]
add r5, r6 ; sum of V coefficients
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
mov r0, r0m
%endif
movzx r3, byte [r3+r2*2 ]
lea r3, [r3+r4+1]
shl r3, 4
+
movd r1d, m0
movsx r1d, r1w
+%ifnidn %3, svq3
+%ifidn %3, h264
+ lea r1d, [r1d*5+32]
+%else ; rv40
+ lea r1d, [r1d*5]
+%endif
+ sar r1d, 6
+%else ; svq3
+ test r1d, r1d
+ lea r4d, [r1d+3]
+ cmovs r1d, r4d
+ sar r1d, 2 ; H/4
+ lea r1d, [r1d*5] ; 5*(H/4)
+ test r1d, r1d
+ lea r4d, [r1d+15]
+ cmovs r1d, r4d
+ sar r1d, 4 ; (5*(H/4))/16
+%endif
+ movd m0, r1d
+
add r1d, r5d
add r3d, r1d
shl r1d, 3
%endif
paddw m0, m1 ; sum of H coefficients
- pmullw m0, [pw_17]
- paddw m0, [pw_16]
- psraw m0, 5
-
lea r4, [r0+r2*4-1]
lea r3, [r0 -1]
add r4, r2
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define e_reg r11
%else
%define e_reg r0
sub r5, e_reg
movzx e_reg, byte [r3 ]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
movzx r10, byte [r4+r2 ]
sub r10, e_reg
sub r5, r10
movzx e_reg, byte [r3+r1 ]
movzx r6, byte [r4+r2*2 ]
sub r6, e_reg
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
add r6, r10
%endif
lea r5, [r5+r6*4]
lea r5, [r5+r6*8]
sar r5, 5
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
mov r0, r0m
%endif
shl r3, 4
movd r1d, m0
movsx r1d, r1w
+ imul r1d, 17
+ add r1d, 16
+ sar r1d, 5
+ movd m0, r1d
add r1d, r5d
sub r3d, r1d
add r1d, r1d
;-----------------------------------------------------------------------------
; void pred8x8_top_dc_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
-%ifdef CONFIG_GPL
cglobal pred8x8_top_dc_mmxext, 2,5
sub r0, r1
movq mm0, [r0]
movq [r4+r1*1], m1
movq [r4+r1*2], m1
RET
-%endif
;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
-%ifdef CONFIG_GPL
%macro PRED8x8L_TOP_DC 1
cglobal pred8x8l_top_dc_%1, 4,4
sub r0, r3
sub r0, r3
lea r2, [r0+r3*2]
movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
+ test r1, r1
+ lea r1, [r0+r3]
+ cmovnz r1, r0
+ punpckhbw mm0, [r1+r3*0-8]
movq mm1, [r2+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r2, r0
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
- movq mm1, [r2]
+ movq mm1, [r1+r3*0-8]
mov r0, r2
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
- test r1, r1 ; top_left
- jnz .do_left
-.fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
-.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
;void pred8x8l_down_left(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
+INIT_MMX
+%define PALIGNR PALIGNR_MMX
+cglobal pred8x8l_down_left_mmxext, 4,5
+ sub r0, r3
+ movq mm0, [r0-8]
+ movq mm3, [r0]
+ movq mm1, [r0+8]
+ movq mm2, mm3
+ movq mm4, mm3
+ PALIGNR mm2, mm0, 7, mm0
+ PALIGNR mm1, mm4, 1, mm4
+ test r1, r1
+ jz .fix_lt_2
+ test r2, r2
+ jz .fix_tr_1
+ jmp .do_top
+.fix_lt_2:
+ movq mm5, mm3
+ pxor mm5, mm2
+ psllq mm5, 56
+ psrlq mm5, 56
+ pxor mm2, mm5
+ test r2, r2
+ jnz .do_top
+.fix_tr_1:
+ movq mm5, mm3
+ pxor mm5, mm1
+ psrlq mm5, 56
+ psllq mm5, 56
+ pxor mm1, mm5
+ jmp .do_top
+.fix_tr_2:
+ punpckhbw mm3, mm3
+ pshufw mm1, mm3, 0xFF
+ jmp .do_topright
+.do_top:
+ PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
+ movq mm7, mm4
+ test r2, r2
+ jz .fix_tr_2
+ movq mm0, [r0+8]
+ movq mm5, mm0
+ movq mm2, mm0
+ movq mm4, mm0
+ psrlq mm5, 56
+ PALIGNR mm2, mm3, 7, mm3
+ PALIGNR mm5, mm4, 1, mm4
+ PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
+.do_topright:
+ lea r1, [r0+r3*2]
+ movq mm6, mm1
+ psrlq mm1, 56
+ movq mm4, mm1
+ lea r2, [r1+r3*2]
+ movq mm2, mm6
+ PALIGNR mm2, mm7, 1, mm0
+ movq mm3, mm6
+ PALIGNR mm3, mm7, 7, mm0
+ PALIGNR mm4, mm6, 1, mm0
+ movq mm5, mm7
+ movq mm1, mm7
+ movq mm7, mm6
+ lea r4, [r2+r3*2]
+ psllq mm1, 8
+ PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
+ PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
+ movq [r4+r3*2], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r4+r3*1], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r2+r3*2], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r2+r3*1], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r1+r3*2], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r1+r3*1], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+ movq [r0+r3*2], mm1
+ psllq mm1, 8
+ psrlq mm0, 56
+ por mm1, mm0
+ movq [r0+r3*1], mm1
+ RET
+
%macro PRED8x8L_DOWN_LEFT 1
cglobal pred8x8l_down_left_%1, 4,4
sub r0, r3
%macro PRED8x8L_VERTICAL_RIGHT 1
cglobal pred8x8l_vertical_right_%1, 4,5,7
+ ; manually spill XMM registers for Win64 because
+ ; the code here is initialized with INIT_MMX
+ WIN64_SPILL_XMM 7
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
sub r0, r3
lea r2, [r0+r3*2]
movq mm0, [r0+r3*1-8]
- punpckhbw mm0, [r0+r3*0-8]
+ test r1, r1
+ lea r1, [r0+r3]
+ cmovnz r1, r0
+ punpckhbw mm0, [r1+r3*0-8]
movq mm1, [r2+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r2, r0
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
- movq mm1, [r2]
+ movq mm1, [r1+r3*0-8]
mov r0, r2
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
- test r1, r1
- jnz .do_left
-.fix_lt_1:
- movq mm5, mm3
- pxor mm5, mm4
- psrlq mm5, 56
- psllq mm5, 48
- pxor mm1, mm5
-.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
INIT_MMX
%define PALIGNR PALIGNR_SSSE3
PRED8x8L_HORIZONTAL_DOWN ssse3
-%endif
;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
; void pred4x4_down_left_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
-%ifdef CONFIG_GPL
INIT_MMX
cglobal pred4x4_down_left_mmxext, 3,3
sub r0, r2
punpckldq m1, [r1]
movq m2, m1
movq m3, m1
- movq m4, m1
psllq m1, 8
pxor m2, m1
psrlq m2, 8
- pxor m3, m2
- PRED4x4_LOWPASS m0, m1, m3, m4, m5
+ pxor m2, m3
+ PRED4x4_LOWPASS m0, m1, m2, m3, m4
lea r1, [r0+r2*2]
psrlq m0, 8
movd [r0+r2*1], m0
cglobal pred4x4_horizontal_up_mmxext, 3,3
sub r0, r2
lea r1, [r0+r2*2]
- movq m0, [r0+r2*1-8]
- punpckhbw m0, [r0+r2*2-8]
- movq m1, [r1+r2*1-8]
- punpckhbw m1, [r1+r2*2-8]
+ movd m0, [r0+r2*1-4]
+ punpcklbw m0, [r0+r2*2-4]
+ movd m1, [r1+r2*1-4]
+ punpcklbw m1, [r1+r2*2-4]
punpckhwd m0, m1
movq m1, m0
punpckhbw m1, m1
movh m0, [r0-4] ; lt ..
punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
- movq m1, [r1+r2*2-8] ; l3
- punpckhbw m1, [r1+r2*1-8] ; l2 l3
- movq m2, [r0+r2*2-8] ; l1
- punpckhbw m2, [r0+r2*1-8] ; l0 l1
+ movd m1, [r1+r2*2-4] ; l3
+ punpcklbw m1, [r1+r2*1-4] ; l2 l3
+ movd m2, [r0+r2*2-4] ; l1
+ punpcklbw m2, [r0+r2*1-4] ; l0 l1
punpckhwd m1, m2 ; l0 l1 l2 l3
punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
movq m0, m1
movh [r0+r2*2], m5
movh [r0+r2*1], m3
RET
-%endif
+
+;-----------------------------------------------------------------------------
+; void pred4x4_vertical_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+%define PALIGNR PALIGNR_MMX
+cglobal pred4x4_vertical_right_mmxext, 3,3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movh m0, [r0] ; ........t3t2t1t0
+ movq m5, m0
+ PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
+ pavgb m5, m0
+ PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
+ movq m1, m0
+ PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
+ movq m2, m0
+ PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
+ PRED4x4_LOWPASS m3, m1, m0, m2, m4
+ movq m1, m3
+ psrlq m3, 16
+ psllq m1, 48
+ movh [r0+r2*1], m5
+ movh [r0+r2*2], m3
+ PALIGNR m5, m1, 7, m2
+ psllq m1, 8
+ movh [r1+r2*1], m5
+ PALIGNR m3, m1, 7, m1
+ movh [r1+r2*2], m3
+ RET
+
+;-----------------------------------------------------------------------------
+; void pred4x4_down_right_mmxext(uint8_t *src, const uint8_t *topright, int stride)
+;-----------------------------------------------------------------------------
+
+INIT_MMX
+%define PALIGNR PALIGNR_MMX
+cglobal pred4x4_down_right_mmxext, 3,3
+ sub r0, r2
+ lea r1, [r0+r2*2]
+ movq m1, [r1-8]
+ movq m2, [r0+r2*1-8]
+ punpckhbw m2, [r0-8]
+ movh m3, [r0]
+ punpckhwd m1, m2
+ PALIGNR m3, m1, 5, m1
+ movq m1, m3
+ PALIGNR m3, [r1+r2*1-8], 7, m4
+ movq m2, m3
+ PALIGNR m3, [r1+r2*2-8], 7, m4
+ PRED4x4_LOWPASS m0, m3, m1, m2, m4
+ movh [r1+r2*2], m0
+ psrlq m0, 8
+ movh [r1+r2*1], m0
+ psrlq m0, 8
+ movh [r0+r2*2], m0
+ psrlq m0, 8
+ movh [r0+r2*1], m0
+ RET