;*****************************************************************************
-;* x86-optimized AC-3 DSP utils
+;* x86-optimized AC-3 DSP functions
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of Libav.
pd_1: times 4 dd 1
pd_151: times 4 dd 151
+; used in ff_apply_window_int16(): word-reversal shuffle mask and rounding term
+pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0 ; loaded into m5 and passed to REVERSE_WORDS as the pshufb mask
+pd_16384: times 4 dd 16384 ; 1 << 14, added to each 32-bit product before the >> 15
+
SECTION .text
;-----------------------------------------------------------------------------
%define LOOP_ALIGN
INIT_MMX mmx
AC3_EXPONENT_MIN
-%if HAVE_MMXEXT
+%if HAVE_MMXEXT_EXTERNAL
%define LOOP_ALIGN ALIGN 16
INIT_MMX mmxext
AC3_EXPONENT_MIN
%endmacro
INIT_MMX mmx
-%define ABS2 ABS2_MMX
AC3_MAX_MSB_ABS_INT16 or_abs
INIT_MMX mmxext
-%define ABS2 ABS2_MMXEXT
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM sse2
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM ssse3
-%define ABS2 ABS2_SSSE3
AC3_MAX_MSB_ABS_INT16 or_abs
;-----------------------------------------------------------------------------
%endif
%endmacro
-%if HAVE_AMD3DNOW_EXTERNAL
-INIT_MMX 3dnow
-cglobal ac3_extract_exponents, 3, 3, 0, exp, coef, len
- add expq, lenq
- lea coefq, [coefq+4*lenq]
- neg lenq
- movq m3, [pd_1]
- movq m4, [pd_151]
-.loop:
- movq m0, [coefq+4*lenq ]
- movq m1, [coefq+4*lenq+8]
- PABSD m0, m2
- PABSD m1, m2
- pslld m0, 1
- por m0, m3
- pi2fd m2, m0
- psrld m2, 23
- movq m0, m4
- psubd m0, m2
- pslld m1, 1
- por m1, m3
- pi2fd m2, m1
- psrld m2, 23
- movq m1, m4
- psubd m1, m2
- packssdw m0, m0
- packuswb m0, m0
- packssdw m1, m1
- packuswb m1, m1
- punpcklwd m0, m1
- movd [expq+lenq], m0
- add lenq, 4
- jl .loop
- REP_RET
-%endif
-
%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
add expq, lenq
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif
+
+;-----------------------------------------------------------------------------
+; void ff_apply_window_int16(int16_t *output, const int16_t *input,
+; const int16_t *window, unsigned int len)
+;-----------------------------------------------------------------------------
+
+%macro REVERSE_WORDS 1-2 ; reverse the order of the 16-bit words in %1, in place; %2 = mask register, read only by the pshufb path
+%if cpuflag(ssse3) && notcpuflag(atom)
+ pshufb %1, %2 ; one byte shuffle driven by the mask held in %2
+%elif cpuflag(sse2) ; NOTE(review): presumably also the path taken by the ssse3+atom build, where the test above fails
+ pshuflw %1, %1, 0x1B ; reverse the 4 words of the low 64 bits
+ pshufhw %1, %1, 0x1B ; reverse the 4 words of the high 64 bits
+ pshufd %1, %1, 0x4E ; swap the two 64-bit halves to finish the 8-word reversal
+%elif cpuflag(mmxext)
+ pshufw %1, %1, 0x1B ; reverse the 4 words of the 64-bit register
+%endif
+%endmacro
+
+%macro MUL16FIXED 3 ; per-word fixed-point multiply with a 15-bit right shift, result left in %1
+%if cpuflag(ssse3) ; dst, src, unused
+; dst = ((dst * src) + (1<<14)) >> 15
+ pmulhrsw %1, %2
+%elif cpuflag(mmxext) ; dst, src, temp
+; dst = (dst * src) >> 15
+; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
+; in from the pmullw result.
+ mova %3, %1 ; keep a copy of dst for the low-half multiply (temp is clobbered)
+ pmulhw %1, %2 ; %1 = bits 31..16 of each signed 16x16 product
+ pmullw %3, %2 ; %3 = bits 15..0 of each product
+ psrlw %3, 15 ; move product bit 15 down to bit 0
+ psllw %1, 1 ; high half << 1, leaving bit 0 free
+ por %1, %3 ; %1 = bits 30..15 of the product, no rounding term added
+%endif
+%endmacro
+
+%macro APPLY_WINDOW_INT16 1 ; %1 != 0: bitexact version (apply_window_int16); %1 == 0: apply_window_int16_round
+%if %1
+cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
+%else
+cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
+%endif
+ lea offset2q, [offsetq-mmsize] ; offset2 = offset - mmsize; offset is the 4th argument ('len' in the C prototype), used unscaled as a byte offset
+%if cpuflag(ssse3) && notcpuflag(atom)
+ mova m5, [pb_revwords] ; m5 = pshufb mask consumed by REVERSE_WORDS
+ ALIGN 16 ; align the .loop label that follows (this build only)
+%elif %1
+ mova m5, [pd_16384] ; m5 = rounding term for the 32-bit path below
+%endif
+.loop:
+%if cpuflag(ssse3)
+ ; This version does the 16x16->16 multiplication in-place without expanding
+ ; to 32-bit. The ssse3 version is bit-identical.
+ mova m0, [windowq+offset2q] ; m0 = window words at the descending offset
+ mova m1, [ inputq+offset2q] ; m1 = input words at the descending offset
+ pmulhrsw m1, m0 ; m1 = (input * window + (1<<14)) >> 15 per word
+ REVERSE_WORDS m0, m5 ; the same window words, reversed, serve the ascending block
+ pmulhrsw m0, [ inputq+offsetq ] ; m0 = reversed window * input at the ascending offset
+ mova [outputq+offset2q], m1
+ mova [outputq+offsetq ], m0
+%elif %1
+ ; This version expands 16-bit to 32-bit, multiplies by the window,
+ ; adds 16384 for rounding, right shifts 15, then repacks back to words to
+ ; save to the output. The window is reversed for the second half.
+ mova m3, [windowq+offset2q] ; m3 = window words (kept for the second half)
+ mova m4, [ inputq+offset2q] ; m4 = input words at the descending offset
+ pxor m0, m0
+ punpcklwd m0, m3 ; m0 = word pairs (0, window), low half
+ punpcklwd m1, m4 ; m1 = word pairs (stale m1 word, input); the stale word meets the 0 in m0
+ pmaddwd m0, m1 ; m0 = 0*stale + window*input as 32-bit values
+ paddd m0, m5 ; + 16384
+ psrad m0, 15
+ pxor m2, m2
+ punpckhwd m2, m3 ; same construction for the high half
+ punpckhwd m1, m4
+ pmaddwd m2, m1
+ paddd m2, m5
+ psrad m2, 15
+ packssdw m0, m2 ; pack both halves back to words with signed saturation
+ mova [outputq+offset2q], m0
+ REVERSE_WORDS m3 ; reversed window for the ascending block
+ mova m4, [ inputq+offsetq] ; m4 = input words at the ascending offset
+ pxor m0, m0
+ punpcklwd m0, m3
+ punpcklwd m1, m4 ; stale words in m1 are again cancelled by the zeros in m0
+ pmaddwd m0, m1
+ paddd m0, m5
+ psrad m0, 15
+ pxor m2, m2
+ punpckhwd m2, m3
+ punpckhwd m1, m4
+ pmaddwd m2, m1
+ paddd m2, m5
+ psrad m2, 15
+ packssdw m0, m2
+ mova [outputq+offsetq], m0
+%else
+ ; This version does the 16x16->16 multiplication in-place without expanding
+ ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
+ ; therefore are not bit-identical to the C version.
+ mova m0, [windowq+offset2q] ; m0 = window words at the descending offset
+ mova m1, [ inputq+offset2q] ; m1 = input words at the descending offset
+ mova m2, [ inputq+offsetq ] ; m2 = input words at the ascending offset
+ MUL16FIXED m1, m0, m3 ; m1 = (m1 * window) >> 15, m3 is scratch
+ REVERSE_WORDS m0 ; reversed window for the ascending block
+ MUL16FIXED m2, m0, m3 ; m2 = (m2 * reversed window) >> 15
+ mova [outputq+offset2q], m1
+ mova [outputq+offsetq ], m2
+%endif
+ add offsetd, mmsize ; ascending offset advances by one register width
+ sub offset2d, mmsize ; descending offset steps back; borrows once it would pass below 0
+ jae .loop ; keep going while the subtraction did not borrow (CF = 0)
+ REP_RET
+%endmacro
+
+INIT_MMX mmxext
+APPLY_WINDOW_INT16 0 ; apply_window_int16_round, MUL16FIXED path without rounding
+INIT_XMM sse2
+APPLY_WINDOW_INT16 0 ; apply_window_int16_round, MUL16FIXED path without rounding
+
+INIT_MMX mmxext
+APPLY_WINDOW_INT16 1 ; apply_window_int16, 32-bit expand/round/repack path
+INIT_XMM sse2
+APPLY_WINDOW_INT16 1 ; apply_window_int16, 32-bit expand/round/repack path
+INIT_XMM ssse3
+APPLY_WINDOW_INT16 1 ; apply_window_int16, pmulhrsw path with pshufb word reversal
+INIT_XMM ssse3, atom
+APPLY_WINDOW_INT16 1 ; apply_window_int16, pmulhrsw path; the atom flag disables the pshufb reversal