; used in ff_ac3_extract_exponents()
pd_1: times 4 dd 1
pd_151: times 4 dd 151
-pb_shuf_4dwb: db 0, 4, 8, 12
SECTION .text
%define LOOP_ALIGN
INIT_MMX
AC3_EXPONENT_MIN mmx
-%ifdef HAVE_MMX2
+%if HAVE_MMX2
%define PMINUB PMINUB_MMXEXT
%define LOOP_ALIGN ALIGN 16
AC3_EXPONENT_MIN mmxext
%endif
-%ifdef HAVE_SSE
+%if HAVE_SSE
INIT_XMM
AC3_EXPONENT_MIN sse2
%endif
pabsd %1, %1
%endmacro
-%ifdef HAVE_AMD3DNOW
+%if HAVE_AMD3DNOW
INIT_MMX
cglobal ac3_extract_exponents_3dnow, 3,3,0, exp, coef, len
add expq, lenq
%endif
%macro AC3_EXTRACT_EXPONENTS 1
-cglobal ac3_extract_exponents_%1, 3,3,5, exp, coef, len
+cglobal ac3_extract_exponents_%1, 3,3,4, exp, coef, len
add expq, lenq
lea coefq, [coefq+4*lenq]
neg lenq
mova m2, [pd_1]
mova m3, [pd_151]
-%ifidn %1, ssse3 ;
- movd m4, [pb_shuf_4dwb]
-%endif
.loop:
; move 4 32-bit coefs to xmm0
mova m0, [coefq+4*lenq]
mova m0, m3
psubd m0, m1
; move the lowest byte in each of 4 dwords to the low dword
-%ifidn %1, ssse3
- pshufb m0, m4
-%else
+ ; NOTE: We cannot just extract the low bytes with pshufb because the dword
+ ; result for 16777215 is -1 due to float inaccuracy. Using packuswb
+ ; clips this to 0, which is the correct exponent.
packssdw m0, m0
packuswb m0, m0
-%endif
movd [expq+lenq], m0
add lenq, 4
REP_RET
%endmacro
-%ifdef HAVE_SSE
+%if HAVE_SSE
INIT_XMM
%define PABSD PABSD_MMX
AC3_EXTRACT_EXPONENTS sse2
-%ifdef HAVE_SSSE3
+%if HAVE_SSSE3
%define PABSD PABSD_SSSE3
AC3_EXTRACT_EXPONENTS ssse3
%endif