+cextern pb_0
+cextern pw_1
+cextern pw_4
+cextern pw_8
+cextern pw_32
+cextern pw_64
+cextern pw_512
+cextern pw_00ff
+cextern pw_pixel_max
+cextern sw_64
+cextern pd_32
+cextern deinterleave_shufd
+
+;=============================================================================
+; implicit weighted biprediction
+;=============================================================================
+; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
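+; in scalar terms, with w = i_weight taken from r6m:
+;   dst[x] = ( src1[x]*w + src2[x]*(64-w) + 32 ) >> 6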
+%if WIN64
+ DECLARE_REG_TMP 0,1,2,3,4,5,4,5
+ %macro AVG_START 0-1 0
+ PROLOGUE 6,7,%1
+ %endmacro
+%elif UNIX64
+ DECLARE_REG_TMP 0,1,2,3,4,5,7,8
+ %macro AVG_START 0-1 0
+ PROLOGUE 6,9,%1
+ %endmacro
+%else
+ DECLARE_REG_TMP 1,2,3,4,5,6,1,2
+ %macro AVG_START 0-1 0
+ PROLOGUE 0,7,%1
+ mov t0, r0m
+ mov t1, r1m
+ mov t2, r2m
+ mov t3, r3m
+ mov t4, r4m
+ mov t5, r5m
+ %endmacro
+%endif
+
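+; all AVG loop bodies handle two rows per iteration: step the dst/src pointers
+; by two lines and count the remaining rows down in eax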
+%macro AVG_END 0
+ lea t4, [t4+t5*2*SIZEOF_PIXEL]
+ lea t2, [t2+t3*2*SIZEOF_PIXEL]
+ lea t0, [t0+t1*2*SIZEOF_PIXEL]
+ sub eax, 2
+ jg .height_loop
+ RET
+%endmacro
+
+%if HIGH_BIT_DEPTH
+
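+; interleave one word from each source per dword lane so that a single pmaddwd
+; computes src1*w + src2*(64-w); pd_32 and psrad 6 then apply the rounded shift
+; (the result is left as dwords; callers pack and clip it)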
+%macro BIWEIGHT_MMX 2
+ movh m0, %1
+ movh m1, %2
+ punpcklwd m0, m1
+ pmaddwd m0, m3
+ paddd m0, m4
+ psrad m0, 6
+%endmacro
+
+%macro BIWEIGHT_START_MMX 0
+ movzx t6d, word r6m ; t6d = w
+ mov t7d, 64
+ sub t7d, t6d ; t7d = 64-w
+ shl t7d, 16
+ add t6d, t7d ; t6d = w | (64-w)<<16
+ movd m3, t6d
+ SPLATD m3, m3 ; every dword lane = (w, 64-w)
+ mova m4, [pd_32]
+ pxor m5, m5
+%endmacro
+
+%else ;!HIGH_BIT_DEPTH
+%macro BIWEIGHT_MMX 2
+ movh m0, %1
+ movh m1, %2
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, m2
+ pmullw m1, m3
+ paddw m0, m1
+ paddw m0, m4
+ psraw m0, 6
+%endmacro
+
+%macro BIWEIGHT_START_MMX 0
+ movd m2, r6m
+ SPLATW m2, m2 ; weight_dst
+ mova m3, [pw_64]
+ psubw m3, m2 ; weight_src
+ mova m4, [pw_32] ; rounding
+ pxor m5, m5
+%endmacro
+%endif ;HIGH_BIT_DEPTH
+
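+; 8-bit ssse3 path: interleaving the sources lets pmaddubsw evaluate
+; src1*w + src2*(64-w) per word (at most 255*64, so it fits in int16_t), and
+; pmulhrsw against pw_512 computes (x*512*2 + 0x8000) >> 16 == (x + 32) >> 6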
+%macro BIWEIGHT_SSSE3 2
+ movh m0, %1
+ movh m1, %2
+ punpcklbw m0, m1
+ pmaddubsw m0, m3
+ pmulhrsw m0, m4
+%endmacro
+
+%macro BIWEIGHT_START_SSSE3 0
+ movzx t6d, byte r6m ; FIXME x86_64
+ mov t7d, 64
+ sub t7d, t6d ; 64-w
+ shl t7d, 8
+ add t6d, t7d ; t6d = w | (64-w)<<8
+ mova m4, [pw_512]
+ movd xm3, t6d
+%if cpuflag(avx2)
+ vpbroadcastw m3, xm3
+%else
+ SPLATW m3, m3 ; weight_dst,src
+%endif
+%endmacro
+
+%if HIGH_BIT_DEPTH
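+; store one row of %4 pixels: when %4 == mmsize/4 a single movh load covers the
+; whole row, wider rows take two passes and pack the dword results together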
+%macro BIWEIGHT_ROW 4
+ BIWEIGHT [%2], [%3]
+%if %4==mmsize/4
+ packssdw m0, m0
+ CLIPW m0, m5, m7
+ movh [%1], m0
+%else
+ SWAP 0, 6
+ BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
+ packssdw m6, m0
+ CLIPW m6, m5, m7
+ mova [%1], m6
+%endif
+%endmacro
+
+%else ;!HIGH_BIT_DEPTH
+%macro BIWEIGHT_ROW 4
+ BIWEIGHT [%2], [%3]
+%if %4==mmsize/2
+ packuswb m0, m0
+ movh [%1], m0
+%else
+ SWAP 0, 6
+ BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
+ packuswb m6, m0
+ mova [%1], m6
+%endif
+%endmacro
+
+%endif ;HIGH_BIT_DEPTH
+
+;-----------------------------------------------------------------------------
+; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
+;-----------------------------------------------------------------------------
+%macro AVG_WEIGHT 1-2 0
+cglobal pixel_avg_weight_w%1
+ BIWEIGHT_START
+ AVG_START %2
+%if HIGH_BIT_DEPTH
+ mova m7, [pw_pixel_max]
+%endif
+.height_loop:
+%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
+ BIWEIGHT [t2], [t4]
+ SWAP 0, 6
+ BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
+%if HIGH_BIT_DEPTH
+ packssdw m6, m0
+ CLIPW m6, m5, m7
+%else ;!HIGH_BIT_DEPTH
+ packuswb m6, m0
+%endif ;HIGH_BIT_DEPTH
+ movlps [t0], m6
+ movhps [t0+SIZEOF_PIXEL*t1], m6
+%else
+%assign x 0
+%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
+ BIWEIGHT_ROW t0+x, t2+x, t4+x, %1
+ BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1
+%assign x x+mmsize
+%endrep
+%endif
+ AVG_END
+%endmacro
+
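+; instantiate the ISA variants; AVG_WEIGHT's optional second argument is the
+; xmm register count forwarded through AVG_START to PROLOGUE so that WIN64
+; builds preserve xmm6 and up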
+%define BIWEIGHT BIWEIGHT_MMX
+%define BIWEIGHT_START BIWEIGHT_START_MMX
+INIT_MMX mmx2
+AVG_WEIGHT 4
+AVG_WEIGHT 8
+AVG_WEIGHT 16
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+AVG_WEIGHT 4, 8
+AVG_WEIGHT 8, 8
+AVG_WEIGHT 16, 8
+%else ;!HIGH_BIT_DEPTH
+INIT_XMM sse2
+AVG_WEIGHT 8, 7
+AVG_WEIGHT 16, 7
+%define BIWEIGHT BIWEIGHT_SSSE3
+%define BIWEIGHT_START BIWEIGHT_START_SSSE3
+INIT_MMX ssse3
+AVG_WEIGHT 4
+INIT_XMM ssse3
+AVG_WEIGHT 8, 7
+AVG_WEIGHT 16, 7
+
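+; avx2: pack two rows of each source into a single ymm register, interleave
+; them with SBUTTERFLY, then apply the same pmaddubsw/pmulhrsw arithmetic as
+; BIWEIGHT_SSSE3 and split the result back across two destination rows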
+INIT_YMM avx2
+cglobal pixel_avg_weight_w16
+ BIWEIGHT_START
+ AVG_START 5
+.height_loop:
+ movu xm0, [t2]
+ movu xm1, [t4]
+ vinserti128 m0, m0, [t2+t3], 1
+ vinserti128 m1, m1, [t4+t5], 1
+ SBUTTERFLY bw, 0, 1, 2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ packuswb m0, m1
+ mova [t0], xm0
+ vextracti128 [t0+t1], m0, 1
+ AVG_END
+%endif ;HIGH_BIT_DEPTH
+
+;=============================================================================
+; P frame explicit weighted prediction
+;=============================================================================
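+; scalar reference for the weight kernels below:
+;   dst[x] = clip( ((src[x]*scale + (1<<(denom-1))) >> denom) + offset )
+; scale, offset and denom are read from the weight_t buffer in r4; its exact
+; packing is prepared by the C-side setup (not shown here)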
+
+%if HIGH_BIT_DEPTH
+; width
+%macro WEIGHT_START 1
+ mova m0, [r4+ 0] ; 1<<denom
+ mova m3, [r4+16]
+ movd m2, [r4+32] ; denom
+ mova m4, [pw_pixel_max]
+ paddw m2, [sq_1] ; denom+1
+%endmacro
+
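+; WEIGHT pairs each source word with the 1<<denom constant so one pmaddwd
+; yields src*scale plus the combined offset/rounding term, and psrad by
+; denom+1 (m2) finishes the division; the scale/offset packing in m3 again
+; comes from the C-side setup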
+; src1, src2
+%macro WEIGHT 2
+ movh m5, [%1]
+ movh m6, [%2]
+ punpcklwd m5, m0
+ punpcklwd m6, m0
+ pmaddwd m5, m3
+ pmaddwd m6, m3
+ psrad m5, m2
+ psrad m6, m2
+ packssdw m5, m6
+%endmacro
+
+; src, dst, width, fast (unused here)
+%macro WEIGHT_TWO_ROW 4
+ %assign x 0
+%rep (%3+mmsize/2-1)/(mmsize/2)
+%if %3-x/2 <= 4 && mmsize == 16
+ WEIGHT %1+x, %1+r3+x
+ CLIPW m5, [pb_0], m4
+ movh [%2+x], m5
+ movhps [%2+r1+x], m5
+%else
+ WEIGHT %1+x, %1+x+mmsize/2
+ SWAP 5, 7
+ WEIGHT %1+r3+x, %1+r3+x+mmsize/2
+ CLIPW m5, [pb_0], m4
+ CLIPW m7, [pb_0], m4
+ mova [%2+x], m7
+ mova [%2+r1+x], m5
+%endif
+ %assign x x+mmsize
+%endrep
+%endmacro
+
+%else ; !HIGH_BIT_DEPTH
+
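+; m3 = per-pixel scale, m4 = rounding/offset term, m5 = denom shift count;
+; the ssse3/avx2 paths drop m5 because pmulhrsw folds the rounding and the
+; shift into the multiply itself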
+%macro WEIGHT_START 1
+%if cpuflag(avx2)
+ vbroadcasti128 m3, [r4]
+ vbroadcasti128 m4, [r4+16]
+%else
+ mova m3, [r4]
+ mova m4, [r4+16]
+%if notcpuflag(ssse3)
+ movd m5, [r4+32]
+%endif
+%endif
+ pxor m2, m2
+%endmacro
+
+; src1, src2, dst1, dst2, fast
+%macro WEIGHT_ROWx2 5
+ movh m0, [%1 ]
+ movh m1, [%1+mmsize/2]
+ movh m6, [%2 ]
+ movh m7, [%2+mmsize/2]
+ punpcklbw m0, m2
+ punpcklbw m1, m2
+ punpcklbw m6, m2
+ punpcklbw m7, m2
+%if cpuflag(ssse3)
+%if %5==0
+ psllw m0, 7
+ psllw m1, 7
+ psllw m6, 7
+ psllw m7, 7
+%endif
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ pmulhrsw m7, m3
+ paddw m0, m4
+ paddw m1, m4
+ paddw m6, m4
+ paddw m7, m4
+%else
+ pmullw m0, m3
+ pmullw m1, m3
+ pmullw m6, m3
+ pmullw m7, m3
+ paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
+ paddsw m1, m4
+ paddsw m6, m4
+ paddsw m7, m4
+ psraw m0, m5
+ psraw m1, m5
+ psraw m6, m5
+ psraw m7, m5
+%endif
+ packuswb m0, m1
+ packuswb m6, m7
+ mova [%3], m0
+ mova [%4], m6
+%endmacro
+
+; src1, src2, dst1, dst2, width, fast
+%macro WEIGHT_COL 6
+%if cpuflag(avx2)
+%if %5==16
+ movu xm0, [%1]
+ vinserti128 m0, m0, [%2], 1
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m0, m2
+%if %6==0
+ psllw m0, 7
+ psllw m1, 7
+%endif
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m0, m4
+ paddw m1, m4
+ packuswb m0, m1
+ mova [%3], xm0
+ vextracti128 [%4], m0, 1
+%else
+ movq xm0, [%1]
+ vinserti128 m0, m0, [%2], 1
+ punpcklbw m0, m2
+%if %6==0
+ psllw m0, 7
+%endif
+ pmulhrsw m0, m3
+ paddw m0, m4
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+%if %5 == 8
+ movq [%3], xm0
+ movq [%4], xm1
+%else
+ movd [%3], xm0
+ movd [%4], xm1
+%endif
+%endif
+%else
+ movh m0, [%1]
+ movh m1, [%2]
+ punpcklbw m0, m2
+ punpcklbw m1, m2
+%if cpuflag(ssse3)
+%if %6==0
+ psllw m0, 7
+ psllw m1, 7
+%endif
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m0, m4
+ paddw m1, m4
+%else
+ pmullw m0, m3
+ pmullw m1, m3
+ paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
+ paddsw m1, m4
+ psraw m0, m5
+ psraw m1, m5
+%endif
+%if %5 == 8
+ packuswb m0, m1
+ movh [%3], m0
+ movhps [%4], m0
+%else
+ packuswb m0, m0
+ packuswb m1, m1
+ movd [%3], m0 ; width 2 can write garbage for the last 2 bytes
+ movd [%4], m1
+%endif
+%endif
+%endmacro
+; src, dst, width, fast
+%macro WEIGHT_TWO_ROW 4
+%assign x 0
+%rep %3
+%if (%3-x) >= mmsize
+ WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
+ %assign x (x+mmsize)
+%else
+ %assign w %3-x
+%if w == 20 ; (avx2) split a 20-wide row into 16-wide and 4-wide column passes
+ %assign w 16
+%endif
+ WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
+ %assign x (x+w)
+%endif
+%if x >= %3
+ %exitrep
+%endif
+%endrep
+%endmacro
+
+%endif ; HIGH_BIT_DEPTH
+
+;-----------------------------------------------------------------------------
+; void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
+;-----------------------------------------------------------------------------
+
+%macro WEIGHTER 1
+cglobal mc_weight_w%1, 6,6,8
+ FIX_STRIDES r1, r3
+ WEIGHT_START %1
+%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
+ ; we can merge the shift step into the scale factor
+ ; if (m3<<7) doesn't overflow an int16_t
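+ ; pmulhrsw computes (a*b*2 + 0x8000) >> 16, so shifting the scale left by 7
+ ; once up front replaces the per-row psllw of the source pixels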
+ cmp byte [r4+1], 0 ; high byte of the 16-bit scale; if zero, scale < 256 and m3<<7 is safe
+ jz .fast
+%endif
+.loop:
+ WEIGHT_TWO_ROW r2, r0, %1, 0
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ sub r5d, 2
+ jg .loop
+ RET
+%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
+.fast:
+ psllw m3, 7
+.fastloop:
+ WEIGHT_TWO_ROW r2, r0, %1, 1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ sub r5d, 2
+ jg .fastloop
+ RET
+%endif
+%endmacro
+
+INIT_MMX mmx2
+WEIGHTER 4
+WEIGHTER 8
+WEIGHTER 12
+WEIGHTER 16
+WEIGHTER 20
+INIT_XMM sse2
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
+%if HIGH_BIT_DEPTH
+WEIGHTER 12
+%else
+INIT_MMX ssse3
+WEIGHTER 4
+INIT_XMM ssse3
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
+INIT_YMM avx2
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
+%endif
+
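+; mc_offset: dst[x] = clip( src[x] +/- offset ) via saturating add/sub; the
+; byte ops clamp to [0,255] on their own, while HIGH_BIT_DEPTH adds a pminsw
+; against pw_pixel_max because word saturation alone only stops at 0xffff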
+%macro OFFSET_OP 7
+ mov%6 m0, [%1]
+ mov%6 m1, [%2]
+%if HIGH_BIT_DEPTH
+ p%5usw m0, m2
+ p%5usw m1, m2
+%ifidn %5,add
+ pminsw m0, m3
+ pminsw m1, m3
+%endif
+%else
+ p%5usb m0, m2
+ p%5usb m1, m2
+%endif
+ mov%7 [%3], m0
+ mov%7 [%4], m1
+%endmacro
+
+%macro OFFSET_TWO_ROW 4
+%assign x 0
+%rep %3
+%if (%3*SIZEOF_PIXEL-x) >= mmsize
+ OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
+ %assign x (x+mmsize)
+%else
+%if HIGH_BIT_DEPTH
+ OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
+%else
+ OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
+%endif
+ %exitrep
+%endif
+%if x >= %3*SIZEOF_PIXEL
+ %exitrep
+%endif
+%endrep
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void mc_offset_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *w, int h )
+;-----------------------------------------------------------------------------
+%macro OFFSET 2
+cglobal mc_offset%2_w%1, 6,6
+ FIX_STRIDES r1, r3
+ mova m2, [r4]
+%if HIGH_BIT_DEPTH
+%ifidn %2,add
+ mova m3, [pw_pixel_max]
+%endif
+%endif
+.loop:
+ OFFSET_TWO_ROW r2, r0, %1, %2
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ sub r5d, 2
+ jg .loop
+ RET
+%endmacro
+
+%macro OFFSETPN 1
+ OFFSET %1, add
+ OFFSET %1, sub
+%endmacro
+INIT_MMX mmx2
+OFFSETPN 4
+OFFSETPN 8
+OFFSETPN 12
+OFFSETPN 16
+OFFSETPN 20
+INIT_XMM sse2
+OFFSETPN 12
+OFFSETPN 16
+OFFSETPN 20
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+OFFSETPN 8
+%endif
+