jg .loop
REP_RET
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal pred16x16_tm_vp8_8, 2, 4, 5, dst, stride, stride3, iteration
+ sub dstq, strideq
+ pmovzxbw m0, [dstq]
+ vpbroadcastb xm1, [r0-1]
+ pmovzxbw m1, xm1
+ psubw m0, m1
+ mov iterationd, 4
+ lea stride3q, [strideq*3]
+.loop:
+ vpbroadcastb xm1, [dstq+strideq*1-1]
+ vpbroadcastb xm2, [dstq+strideq*2-1]
+ vpbroadcastb xm3, [dstq+stride3q-1]
+ vpbroadcastb xm4, [dstq+strideq*4-1]
+ pmovzxbw m1, xm1
+ pmovzxbw m2, xm2
+ pmovzxbw m3, xm3
+ pmovzxbw m4, xm4
+ paddw m1, m0
+ paddw m2, m0
+ paddw m3, m0
+ paddw m4, m0
+ vpackuswb m1, m1, m2
+ vpackuswb m3, m3, m4
+ vpermq m1, m1, q3120
+ vpermq m3, m3, q3120
+ movdqa [dstq+strideq*1], xm1
+ vextracti128 [dstq+strideq*2], m1, 1
+ movdqa [dstq+stride3q*1], xm3
+ vextracti128 [dstq+strideq*4], m3, 1
+ lea dstq, [dstq+strideq*4]
+ dec iterationd
+ jg .loop
+ REP_RET
+%endif
+
;-----------------------------------------------------------------------------
- ; void ff_pred16x16_plane_*_8(uint8_t *src, int stride)
+ ; void ff_pred16x16_plane_*_8(uint8_t *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro H264_PRED16x16_PLANE 1
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
+%endif
;------------------------------------------------------------------------------
- ; void ff_pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
+ ; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
+ ; ptrdiff_t stride)
;------------------------------------------------------------------------------
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
+%endif
;-------------------------------------------------------------------------------
- ; void ff_pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
+ ; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
+ ; ptrdiff_t stride)
;-------------------------------------------------------------------------------
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
+%endif
;-----------------------------------------------------------------------------
- ; void ff_pred4x4_dc(pixel *src, const pixel *topright, int stride)
+ ; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
-%macro HADDD 2 ; sum junk
-%if mmsize == 16
- movhlps %2, %1
- paddd %1, %2
- pshuflw %2, %1, 0xE
- paddd %1, %2
-%else
- pshufw %2, %1, 0xE
- paddd %1, %2
-%endif
-%endmacro
-
-%macro HADDW 2
- pmaddwd %1, [pw_1]
- HADDD %1, %2
-%endmacro
INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
INIT_XMM sse2
PRED4x4_DL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
+%endif
;-----------------------------------------------------------------------------
- ; void ff_pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
+ ; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
+ ; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
INIT_XMM sse2
PRED4x4_VL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
+%endif
;-----------------------------------------------------------------------------
- ; void ff_pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
+ ; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
+ ; ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
INIT_XMM sse2
PRED8x8L_TOP_DC
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
+%endif
;-------------------------------------------------------------------------------
- ; void ff_pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
+ ; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
+ ; ptrdiff_t stride)
;-------------------------------------------------------------------------------
;TODO: see if scalar is faster
%macro PRED8x8L_DC 0
INIT_XMM sse2
PRED8x8L_DC
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
+%endif
;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_vertical(pixel *src, int has_topleft, int has_topright,
- ; int stride)
+ ; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
+ ; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
INIT_XMM sse2
PRED8x8L_VERTICAL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
+%endif
;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_horizontal(uint8_t *src, int has_topleft, int has_topright,
- ; int stride)
+ ; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft,
+ ; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
+%endif
;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_down_left(pixel *src, int has_topleft, int has_topright,
- ; int stride)
+ ; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
+ ; ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
+%endif
;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_down_right(pixel *src, int has_topleft, int has_topright,
- ; int stride)
+ ; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
+ ; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; standard forbids this when has_topleft is false
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
+%endif
;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_vertical_right(pixel *src, int has_topleft,
- ; int has_topright, int stride)
+ ; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
+ ; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; likewise with 8x8l_down_right
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
+%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
+%endif
;-----------------------------------------------------------------------------
- ; void ff_pred8x8l_horizontal_up(pixel *src, int has_topleft,
- ; int has_topright, int stride)
+ ; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
+ ; int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6