+;-------------------------------------------------------------------------------------------
+; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+;-------------------------------------------------------------------------------------------
+
+%macro VP9_IADST4_1D 0
+ movq2dq xmm0, m0
+ movq2dq xmm1, m1
+ movq2dq xmm2, m2
+ movq2dq xmm3, m3
+ paddw m3, m0
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+ pmaddwd xmm1, xmm0, [pw_5283_13377]
+ pmaddwd xmm4, xmm0, [pw_9929_13377]
+ pmaddwd xmm0, [pw_15212_m13377]
+ pmaddwd xmm3, xmm2, [pw_15212_9929]
+ pmaddwd xmm2, [pw_m5283_m15212]
+ psubw m3, m2
+ paddd xmm0, xmm2
+ paddd xmm3, [pd_8192]
+ paddd xmm2, [pd_8192]
+ paddd xmm1, xmm3
+ paddd xmm0, xmm3
+ paddd xmm4, xmm2
+ psrad xmm1, 14
+ psrad xmm0, 14
+ psrad xmm4, 14
+ pmulhrsw m3, [pw_13377x2] ; out2
+ packssdw xmm0, xmm0
+ packssdw xmm1, xmm1
+ packssdw xmm4, xmm4
+ movdq2q m0, xmm0 ; out3
+ movdq2q m1, xmm1 ; out0
+ movdq2q m2, xmm4 ; out1
+ SWAP 0, 1, 2, 3
+%endmacro
+
+%macro IADST4_FN 5
+INIT_MMX %5
+cglobal vp9_%1_%3_4x4_add, 3, 3, 8, dst, stride, block, eob
+ mova m0, [blockq+ 0]
+ mova m1, [blockq+ 8]
+ mova m2, [blockq+16]
+ mova m3, [blockq+24]
+ mova m6, [pw_11585x2]
+ mova m7, [pd_8192] ; rounding
+ VP9_%2_1D
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+ VP9_%4_1D
+ pxor m4, m4 ; used for the block reset, and VP9_STORE_2X
+ mova [blockq+ 0], m4
+ mova [blockq+ 8], m4
+ mova [blockq+16], m4
+ mova [blockq+24], m4
+ VP9_IDCT4_WRITEOUT
+ RET
+%endmacro
+
+IADST4_FN idct, IDCT4, iadst, IADST4, ssse3
+IADST4_FN iadst, IADST4, idct, IDCT4, ssse3
+IADST4_FN iadst, IADST4, iadst, IADST4, ssse3
+