%endmacro
%macro VP9_IDCT16_1D 2-3 16 ; src, pass, nnzc
- VP9_IDCT16_1D_START %1, %3, 32, rsp+32
+ VP9_IDCT16_1D_START %1, %3, 32, tmpq+32
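+ ; the temp buffer is now addressed through tmpq, which the callers point
+ ; at rsp, so this macro no longer reads or modifies the stack pointer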
%if %2 == 1
; backup a different register
- mova [rsp+16], m15
- mova m7, [rsp+32]
+ mova [tmpq+16], m15
+ mova m7, [tmpq+32]
SUMSUB_BA w, 6, 9, 15 ; t6, t9
SUMSUB_BA w, 7, 8, 15 ; t7, t8
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 15
- mova [rsp+ 0], m0
- mova [rsp+ 32], m1
- mova [rsp+ 64], m2
- mova [rsp+ 96], m3
- mova [rsp+128], m4
- mova [rsp+160], m5
- mova [rsp+192], m6
- mova [rsp+224], m7
-
- mova m15, [rsp+16]
+ mova [tmpq+ 0], m0
+ mova [tmpq+ 32], m1
+ mova [tmpq+ 64], m2
+ mova [tmpq+ 96], m3
+ mova [tmpq+128], m4
+ mova [tmpq+160], m5
+ mova [tmpq+192], m6
+ mova [tmpq+224], m7
+
+ mova m15, [tmpq+16]
TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
- mova [rsp+ 16], m8
- mova [rsp+ 48], m9
- mova [rsp+ 80], m10
- mova [rsp+112], m11
- mova [rsp+144], m12
- mova [rsp+176], m13
- mova [rsp+208], m14
- mova [rsp+240], m15
+ mova [tmpq+ 16], m8
+ mova [tmpq+ 48], m9
+ mova [tmpq+ 80], m10
+ mova [tmpq+112], m11
+ mova [tmpq+144], m12
+ mova [tmpq+176], m13
+ mova [tmpq+208], m14
+ mova [tmpq+240], m15
%else ; %2 == 2
; backup more registers
- mova [rsp+64], m8
- mova [rsp+96], m9
+ mova [tmpq+64], m8
+ mova [tmpq+96], m9
pxor m7, m7
pmulhrsw m0, [pw_512]
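+ ; pmulhrsw against pw_512 is a rounded right shift by 6, the final
+ ; scaling of the second idct pass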
; restore from cache
SWAP 0, 7 ; move zero from m7 to m0
- mova m7, [rsp+32]
- mova m8, [rsp+64]
- mova m9, [rsp+96]
+ mova m7, [tmpq+32]
+ mova m8, [tmpq+64]
+ mova m9, [tmpq+96]
SUMSUB_BA w, 6, 9, 1 ; t6, t9
SUMSUB_BA w, 7, 8, 1 ; t7, t8
%macro VP9_IDCT_IDCT_16x16_ADD_XMM 1
INIT_XMM %1
-cglobal vp9_idct_idct_16x16_add, 4, 5, 16, 512, dst, stride, block, eob
+cglobal vp9_idct_idct_16x16_add, 4, 6, 16, 512, dst, stride, block, eob
; 2x2=eob=3, 4x4=eob=10
cmp eobd, 38
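+ ; eob <= 38 guarantees all nonzero coefficients sit in the top-left
+ ; 8x8 quadrant, so the 8x8 shortcut path is sufficient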
jg .idctfull
VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5
RET
+ DEFINE_ARGS dst, stride, block, cnt, dst_bak, tmp
.idct8x8:
- DEFINE_ARGS dst, stride, block, cnt, dst_bak
+ mov tmpq, rsp
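+ ; the loops below advance tmpq instead of rsp; since rsp is left
+ ; untouched, the old "sub rsp" restores at the loop tails are dropped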
VP9_IDCT16_1D blockq, 1, 8
mov cntd, 2
mov dst_bakq, dstq
.loop2_8x8:
- VP9_IDCT16_1D rsp, 2, 8
+ VP9_IDCT16_1D tmpq, 2, 8
lea dstq, [dst_bakq+8]
- add rsp, 16
+ add tmpq, 16
dec cntd
jg .loop2_8x8
- sub rsp, 32
; at the end of the loop, m0 should still be zero
; use that to zero out block coefficients
RET
.idctfull:
- DEFINE_ARGS dst, stride, block, cnt, dst_bak
mov cntd, 2
+ mov tmpq, rsp
.loop1_full:
VP9_IDCT16_1D blockq, 1
add blockq, 16
- add rsp, 256
+ add tmpq, 256
dec cntd
jg .loop1_full
sub blockq, 32
- sub rsp, 512
mov cntd, 2
+ mov tmpq, rsp
mov dst_bakq, dstq
.loop2_full:
- VP9_IDCT16_1D rsp, 2
+ VP9_IDCT16_1D tmpq, 2
lea dstq, [dst_bakq+8]
- add rsp, 16
+ add tmpq, 16
dec cntd
jg .loop2_full
- sub rsp, 32
; at the end of the loop, m0 should still be zero
; use that to zero out block coefficients
VP9_RND_SH_SUMSUB_BA 2, 11, 5, 7, 12, [pd_8192] ; m2=t2[w], m11=t10[w]
VP9_RND_SH_SUMSUB_BA 3, 10, 4, 6, 12, [pd_8192] ; m3=t3[w], m10=t11[w]
- mova [rsp+ 0*%%str], m9 ; make some scratch space (t0:m9->r0)
+ mova [tmpq+ 0*%%str], m9 ; make some scratch space (t0:m9->r0)
mova m4, [%1+ 4*32] ; in4
mova m5, [%1+11*32] ; in11
mova m12, [%1+ 3*32] ; in3
VP9_RND_SH_SUMSUB_BA 13, 4, 15, 6, 9, [pd_8192] ; m13=t4[w], m4=t12[w]
VP9_RND_SH_SUMSUB_BA 12, 5, 14, 7, 9, [pd_8192] ; m12=t5[w], m5=t13[w]
- mova [rsp+ 2*%%str], m8 ; t1:m9->r2
- mova [rsp+ 3*%%str], m2 ; t2:m2->r3
- mova [rsp+ 4*%%str], m3 ; t3:m3->r4
- mova [rsp+ 5*%%str], m13 ; t4:m13->r5
+ mova [tmpq+ 2*%%str], m8 ; t1:m8->r2
+ mova [tmpq+ 3*%%str], m2 ; t2:m2->r3
+ mova [tmpq+ 4*%%str], m3 ; t3:m3->r4
+ mova [tmpq+ 5*%%str], m13 ; t4:m13->r5
mova m2, [%1+ 6*32] ; in6
mova m3, [%1+ 9*32] ; in9
mova m8, [%1+ 1*32] ; in1
; m3=out1, m11=out2, m1=out5, m5=out6, m4=out9, m0=out10, m10=out13, m2=out14
- mova m6, [rsp+ 0*%%str]
- mova m7, [rsp+ 2*%%str]
- mova m13, [rsp+ 3*%%str]
- mova m14, [rsp+ 4*%%str]
- mova m15, [rsp+ 5*%%str]
- mova [rsp+ 8*%%str], m5
- mova [rsp+ 9*%%str], m4
- mova [rsp+10*%%str], m0
- mova [rsp+11*%%str], m10
- mova [rsp+12*%%str], m2
+ mova m6, [tmpq+ 0*%%str]
+ mova m7, [tmpq+ 2*%%str]
+ mova m13, [tmpq+ 3*%%str]
+ mova m14, [tmpq+ 4*%%str]
+ mova m15, [tmpq+ 5*%%str]
+ mova [tmpq+ 8*%%str], m5
+ mova [tmpq+ 9*%%str], m4
+ mova [tmpq+10*%%str], m0
+ mova [tmpq+11*%%str], m10
+ mova [tmpq+12*%%str], m2
; m6=t0, m7=t1, m13=t2, m14=t3, m15=t4, m12=t5, m9=t6, m8=t7
; m3=out1, m11=out2, m1=out5, r8=out6, r9=out9, r10=out10, r11=out13, r12=out14
%if %2 == 1
- mova m0, [rsp+ 8*%%str]
+ mova m0, [tmpq+ 8*%%str]
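+ ; pass 1 ends by transposing the outputs and parking them in the temp
+ ; buffer; pass 2 (the %else below) rounds them and adds them into dst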
TRANSPOSE8x8W 9, 3, 11, 14, 7, 1, 0, 12, 2
- mova [rsp+ 0*16], m9
- mova [rsp+ 2*16], m3
- mova [rsp+ 4*16], m11
- mova [rsp+ 6*16], m14
- mova m9, [rsp+ 9*%%str]
- mova m3, [rsp+10*%%str]
- mova m11, [rsp+11*%%str]
- mova m14, [rsp+12*%%str]
- mova [rsp+ 8*16], m7
- mova [rsp+10*16], m1
- mova [rsp+12*16], m0
- mova [rsp+14*16], m12
+ mova [tmpq+ 0*16], m9
+ mova [tmpq+ 2*16], m3
+ mova [tmpq+ 4*16], m11
+ mova [tmpq+ 6*16], m14
+ mova m9, [tmpq+ 9*%%str]
+ mova m3, [tmpq+10*%%str]
+ mova m11, [tmpq+11*%%str]
+ mova m14, [tmpq+12*%%str]
+ mova [tmpq+ 8*16], m7
+ mova [tmpq+10*16], m1
+ mova [tmpq+12*16], m0
+ mova [tmpq+14*16], m12
TRANSPOSE8x8W 15, 9, 3, 6, 13, 11, 14, 8, 2
- mova [rsp+ 1*16], m15
- mova [rsp+ 3*16], m9
- mova [rsp+ 5*16], m3
- mova [rsp+ 7*16], m6
- mova [rsp+ 9*16], m13
- mova [rsp+11*16], m11
- mova [rsp+13*16], m14
- mova [rsp+15*16], m8
+ mova [tmpq+ 1*16], m15
+ mova [tmpq+ 3*16], m9
+ mova [tmpq+ 5*16], m3
+ mova [tmpq+ 7*16], m6
+ mova [tmpq+ 9*16], m13
+ mova [tmpq+11*16], m11
+ mova [tmpq+13*16], m14
+ mova [tmpq+15*16], m8
%else
- mova m5, [rsp+ 8*%%str]
+ mova m5, [tmpq+ 8*%%str]
pxor m0, m0
pmulhrsw m9, [pw_512]
VP9_STORE_2X 5, 12, 2, 4, 0
lea dstq, [dstq+strideq*2]
- mova m9, [rsp+ 9*%%str]
- mova m3, [rsp+10*%%str]
- mova m11, [rsp+11*%%str]
- mova m14, [rsp+12*%%str]
+ mova m9, [tmpq+ 9*%%str]
+ mova m3, [tmpq+10*%%str]
+ mova m11, [tmpq+11*%%str]
+ mova m14, [tmpq+12*%%str]
pmulhrsw m15, [pw_512]
pmulhrsw m9, [pw_512]
%macro IADST16_FN 5
INIT_XMM %5
-cglobal vp9_%1_%3_16x16_add, 3, 5, 16, 512, dst, stride, block, eob
- ; potential eob checks go here
-
- DEFINE_ARGS dst, stride, block, cnt, dst_bak
+cglobal vp9_%1_%3_16x16_add, 3, 6, 16, 512, dst, stride, block, cnt, dst_bak, tmp
mov cntd, 2
+ mov tmpq, rsp
.loop1_full:
VP9_%2_1D blockq, 1
add blockq, 16
- add rsp, 256
+ add tmpq, 256
dec cntd
jg .loop1_full
sub blockq, 32
- sub rsp, 512
mov cntd, 2
+ mov tmpq, rsp
mov dst_bakq, dstq
.loop2_full:
- VP9_%4_1D rsp, 2
+ VP9_%4_1D tmpq, 2
lea dstq, [dst_bakq+8]
- add rsp, 16
+ add tmpq, 16
dec cntd
jg .loop2_full
- sub rsp, 32
; at the end of the loop, m0 should still be zero
; use that to zero out block coefficients
%macro VP9_IDCT32_1D 2-3 32 ; src, pass, nnzc
%assign %%str 16*%2*%2
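+ ; %%str is the byte distance between temp-row slots: 16 in pass 1, 64 in pass 2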
; first do t0-15, this can be done identical to idct16x16
- VP9_IDCT16_1D_START %1, %3/2, 64*2, rsp+ 4*%%str
+ VP9_IDCT16_1D_START %1, %3/2, 64*2, tmpq+ 4*%%str
; backup a different register
- mova [rsp+30*%%str], m15 ; t15
- mova m7, [rsp+ 4*%%str]
+ mova [tmpq+30*%%str], m15 ; t15
+ mova m7, [tmpq+ 4*%%str]
SUMSUB_BA w, 6, 9, 15 ; t6, t9
SUMSUB_BA w, 7, 8, 15 ; t7, t8
; store everything on stack to make space available for t16-31
; we store interleaved with the output of the second half (t16-31)
; so we don't need to allocate extra stack space
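+ ; concretely, t0-t15 occupy the even slots ({0,2,...,30}*%%str) and
+ ; t16-t31 fill the odd slots in between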
- mova [rsp+ 0*%%str], m0 ; t0
- mova [rsp+ 4*%%str], m1 ; t1
- mova [rsp+ 8*%%str], m2 ; t2
- mova [rsp+12*%%str], m3 ; t3
- mova [rsp+16*%%str], m4 ; t4
- mova [rsp+20*%%str], m5 ; t5
- mova [rsp+24*%%str], m6 ; t6
- mova [rsp+28*%%str], m7 ; t7
- mova [rsp+ 2*%%str], m8 ; t8
- mova [rsp+ 6*%%str], m9 ; t9
- mova [rsp+10*%%str], m10 ; t10
- mova [rsp+14*%%str], m11 ; t11
- mova [rsp+18*%%str], m12 ; t12
- mova [rsp+22*%%str], m13 ; t13
- mova [rsp+26*%%str], m14 ; t14
+ mova [tmpq+ 0*%%str], m0 ; t0
+ mova [tmpq+ 4*%%str], m1 ; t1
+ mova [tmpq+ 8*%%str], m2 ; t2
+ mova [tmpq+12*%%str], m3 ; t3
+ mova [tmpq+16*%%str], m4 ; t4
+ mova [tmpq+20*%%str], m5 ; t5
+ mova [tmpq+24*%%str], m6 ; t6
+ mova [tmpq+28*%%str], m7 ; t7
+ mova [tmpq+ 2*%%str], m8 ; t8
+ mova [tmpq+ 6*%%str], m9 ; t9
+ mova [tmpq+10*%%str], m10 ; t10
+ mova [tmpq+14*%%str], m11 ; t11
+ mova [tmpq+18*%%str], m12 ; t12
+ mova [tmpq+22*%%str], m13 ; t13
+ mova [tmpq+26*%%str], m14 ; t14
; then, secondly, do t16-31
%if %3 <= 8
; from 1 stage forward
SUMSUB_BA w, 8, 4, 1
; temporary storage
- mova [rsp+17*%%str], m8 ; t16
- mova [rsp+21*%%str], m4 ; t19
+ mova [tmpq+17*%%str], m8 ; t16
+ mova [tmpq+21*%%str], m4 ; t19
VP9_UNPACK_MULSUB_2W_4X 1, 14, 15, 0, 9102, 13623, [pd_8192], 4, 8 ; t21, t26
VP9_UNPACK_MULSUB_2W_4X 13, 2, 3, 12, 13623, m9102, [pd_8192], 4, 8 ; t22, t25
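+ ; an "m" prefix on a coefficient (m9102) denotes its negation, -9102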
; from 2 stages forward
SUMSUB_BA w, 8, 4, 2
; temporary storage
- mova [rsp+17*%%str], m8 ; t16
- mova [rsp+21*%%str], m4 ; t19
+ mova [tmpq+17*%%str], m8 ; t16
+ mova [tmpq+21*%%str], m4 ; t19
%if %3 <= 16
pmulhrsw m3, m12, [pw_13160x2]
pmulhrsw m12, [pw_9760x2]
; m8[s]=t16, m9=t17, m5=t18, m4[s]=t19, m12=t20, m13=t21, m1=t22, m0=t23,
; m15=t24, m14=t25, m2=t26, m3=t27, m11=t28, m10=t29, m6=t30, m7=t31
- mova m8, [rsp+17*%%str] ; t16
+ mova m8, [tmpq+17*%%str] ; t16
; from 2 stages forward
SUMSUB_BA w, 0, 8, 4
SUMSUB_BA w, 15, 7, 4
pmulhrsw m7, [pw_11585x2]
pmulhrsw m8, [pw_11585x2]
; store t16/t23
- mova [rsp+ 1*%%str], m0 ; t16
- mova [rsp+29*%%str], m7 ; t23
+ mova [tmpq+ 1*%%str], m0 ; t16
+ mova [tmpq+29*%%str], m7 ; t23
- mova m4, [rsp+21*%%str] ; t19
+ mova m4, [tmpq+21*%%str] ; t19
VP9_UNPACK_MULSUB_2W_4X 10, 5, 15137, 6270, [pd_8192], 0, 7 ; t18, t29
VP9_UNPACK_MULSUB_2W_4X 11, 4, 15137, 6270, [pd_8192], 0, 7 ; t19, t28
VP9_UNPACK_MULSUB_2W_4X 3, 12, 6270, m15137, [pd_8192], 0, 7 ; t20, t27
; store t17-19 (and t20-22 for pass 1) - keep t24-31 in registers for
; final sumsub in pass 1, or keep t20-22 and t24-31 in registers for
; final sumsub of pass 2
- mova [rsp+ 5*%%str], m1 ; t17
- mova [rsp+ 9*%%str], m2 ; t18
- mova [rsp+13*%%str], m3 ; t19
+ mova [tmpq+ 5*%%str], m1 ; t17
+ mova [tmpq+ 9*%%str], m2 ; t18
+ mova [tmpq+13*%%str], m3 ; t19
; then do final pass to sumsub+store the two halves
%if %2 == 1
- mova [rsp+17*%%str], m4 ; t20
- mova [rsp+21*%%str], m5 ; t21
- mova [rsp+25*%%str], m6 ; t22
-
- mova m0, [rsp+ 0*%%str] ; t0
- mova m1, [rsp+ 4*%%str] ; t1
- mova m2, [rsp+ 8*%%str] ; t2
- mova m3, [rsp+12*%%str] ; t3
- mova m4, [rsp+16*%%str] ; t4
- mova m5, [rsp+20*%%str] ; t5
- mova m6, [rsp+24*%%str] ; t6
+ mova [tmpq+17*%%str], m4 ; t20
+ mova [tmpq+21*%%str], m5 ; t21
+ mova [tmpq+25*%%str], m6 ; t22
+
+ mova m0, [tmpq+ 0*%%str] ; t0
+ mova m1, [tmpq+ 4*%%str] ; t1
+ mova m2, [tmpq+ 8*%%str] ; t2
+ mova m3, [tmpq+12*%%str] ; t3
+ mova m4, [tmpq+16*%%str] ; t4
+ mova m5, [tmpq+20*%%str] ; t5
+ mova m6, [tmpq+24*%%str] ; t6
SUMSUB_BA w, 15, 0, 7
- mova [rsp+ 3*%%str], m0 ; t15
- mova m7, [rsp+28*%%str] ; t7
+ mova [tmpq+ 3*%%str], m0 ; t15
+ mova m7, [tmpq+28*%%str] ; t7
SUMSUB_BA w, 14, 1, 0
SUMSUB_BA w, 13, 2, 0
SUMSUB_BA w, 12, 3, 0
SUMSUB_BA w, 8, 7, 0
TRANSPOSE8x8W 15, 14, 13, 12, 11, 10, 9, 8, 0
- mova [rsp+ 0*%%str], m15
- mova [rsp+ 4*%%str], m14
- mova [rsp+ 8*%%str], m13
- mova [rsp+12*%%str], m12
- mova [rsp+16*%%str], m11
- mova [rsp+20*%%str], m10
- mova [rsp+24*%%str], m9
- mova [rsp+28*%%str], m8
-
- mova m0, [rsp+ 3*%%str] ; t15
+ mova [tmpq+ 0*%%str], m15
+ mova [tmpq+ 4*%%str], m14
+ mova [tmpq+ 8*%%str], m13
+ mova [tmpq+12*%%str], m12
+ mova [tmpq+16*%%str], m11
+ mova [tmpq+20*%%str], m10
+ mova [tmpq+24*%%str], m9
+ mova [tmpq+28*%%str], m8
+
+ mova m0, [tmpq+ 3*%%str] ; t15
TRANSPOSE8x8W 7, 6, 5, 4, 3, 2, 1, 0, 8
- mova [rsp+ 3*%%str], m7
- mova [rsp+ 7*%%str], m6
- mova [rsp+11*%%str], m5
- mova [rsp+15*%%str], m4
- mova [rsp+19*%%str], m3
- mova [rsp+23*%%str], m2
- mova [rsp+27*%%str], m1
- mova [rsp+31*%%str], m0
-
- mova m15, [rsp+ 2*%%str] ; t8
- mova m14, [rsp+ 6*%%str] ; t9
- mova m13, [rsp+10*%%str] ; t10
- mova m12, [rsp+14*%%str] ; t11
- mova m11, [rsp+18*%%str] ; t12
- mova m10, [rsp+22*%%str] ; t13
- mova m9, [rsp+26*%%str] ; t14
- mova m8, [rsp+30*%%str] ; t15
- mova m7, [rsp+ 1*%%str] ; t16
- mova m6, [rsp+ 5*%%str] ; t17
- mova m5, [rsp+ 9*%%str] ; t18
- mova m4, [rsp+13*%%str] ; t19
- mova m3, [rsp+17*%%str] ; t20
- mova m2, [rsp+21*%%str] ; t21
- mova m1, [rsp+25*%%str] ; t22
+ mova [tmpq+ 3*%%str], m7
+ mova [tmpq+ 7*%%str], m6
+ mova [tmpq+11*%%str], m5
+ mova [tmpq+15*%%str], m4
+ mova [tmpq+19*%%str], m3
+ mova [tmpq+23*%%str], m2
+ mova [tmpq+27*%%str], m1
+ mova [tmpq+31*%%str], m0
+
+ mova m15, [tmpq+ 2*%%str] ; t8
+ mova m14, [tmpq+ 6*%%str] ; t9
+ mova m13, [tmpq+10*%%str] ; t10
+ mova m12, [tmpq+14*%%str] ; t11
+ mova m11, [tmpq+18*%%str] ; t12
+ mova m10, [tmpq+22*%%str] ; t13
+ mova m9, [tmpq+26*%%str] ; t14
+ mova m8, [tmpq+30*%%str] ; t15
+ mova m7, [tmpq+ 1*%%str] ; t16
+ mova m6, [tmpq+ 5*%%str] ; t17
+ mova m5, [tmpq+ 9*%%str] ; t18
+ mova m4, [tmpq+13*%%str] ; t19
+ mova m3, [tmpq+17*%%str] ; t20
+ mova m2, [tmpq+21*%%str] ; t21
+ mova m1, [tmpq+25*%%str] ; t22
SUMSUB_BA w, 7, 8, 0
- mova [rsp+ 2*%%str], m8
- mova m0, [rsp+29*%%str] ; t23
+ mova [tmpq+ 2*%%str], m8
+ mova m0, [tmpq+29*%%str] ; t23
SUMSUB_BA w, 6, 9, 8
SUMSUB_BA w, 5, 10, 8
SUMSUB_BA w, 4, 11, 8
SUMSUB_BA w, 0, 15, 8
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
- mova [rsp+ 1*%%str], m0
- mova [rsp+ 5*%%str], m1
- mova [rsp+ 9*%%str], m2
- mova [rsp+13*%%str], m3
- mova [rsp+17*%%str], m4
- mova [rsp+21*%%str], m5
- mova [rsp+25*%%str], m6
- mova [rsp+29*%%str], m7
-
- mova m8, [rsp+ 2*%%str]
+ mova [tmpq+ 1*%%str], m0
+ mova [tmpq+ 5*%%str], m1
+ mova [tmpq+ 9*%%str], m2
+ mova [tmpq+13*%%str], m3
+ mova [tmpq+17*%%str], m4
+ mova [tmpq+21*%%str], m5
+ mova [tmpq+25*%%str], m6
+ mova [tmpq+29*%%str], m7
+
+ mova m8, [tmpq+ 2*%%str]
TRANSPOSE8x8W 8, 9, 10, 11, 12, 13, 14, 15, 0
- mova [rsp+ 2*%%str], m8
- mova [rsp+ 6*%%str], m9
- mova [rsp+10*%%str], m10
- mova [rsp+14*%%str], m11
- mova [rsp+18*%%str], m12
- mova [rsp+22*%%str], m13
- mova [rsp+26*%%str], m14
- mova [rsp+30*%%str], m15
+ mova [tmpq+ 2*%%str], m8
+ mova [tmpq+ 6*%%str], m9
+ mova [tmpq+10*%%str], m10
+ mova [tmpq+14*%%str], m11
+ mova [tmpq+18*%%str], m12
+ mova [tmpq+22*%%str], m13
+ mova [tmpq+26*%%str], m14
+ mova [tmpq+30*%%str], m15
%else
- ; t0-7 is in [rsp+{0,4,8,12,16,20,24,28}*%%str]
- ; t8-15 is in [rsp+{2,6,10,14,18,22,26,30}*%%str]
- ; t16-19 and t23 is in [rsp+{1,5,9,13,29}*%%str]
+ ; t0-7 is in [tmpq+{0,4,8,12,16,20,24,28}*%%str]
+ ; t8-15 is in [tmpq+{2,6,10,14,18,22,26,30}*%%str]
+ ; t16-19 and t23 is in [tmpq+{1,5,9,13,29}*%%str]
; t20-22 is in m4-6
; t24-31 is in m8-15
pxor m7, m7
%endmacro
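+ ; m7 is the zero register handed to every %%STORE_2X2 below; it stays
+ ; zero through the stores, which the loop tails rely on to clear the
+ ; coefficient block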
; store t0-1 and t30-31
- mova m0, [rsp+ 0*%%str]
- mova m1, [rsp+ 4*%%str]
+ mova m0, [tmpq+ 0*%%str]
+ mova m1, [tmpq+ 4*%%str]
%%STORE_2X2 0, 1, 14, 15, 2, 3, 7
; store t2-3 and t28-29
- mova m0, [rsp+ 8*%%str]
- mova m1, [rsp+12*%%str]
+ mova m0, [tmpq+ 8*%%str]
+ mova m1, [tmpq+12*%%str]
%%STORE_2X2 0, 1, 12, 13, 2, 3, 7
; store t4-5 and t26-27
- mova m0, [rsp+16*%%str]
- mova m1, [rsp+20*%%str]
+ mova m0, [tmpq+16*%%str]
+ mova m1, [tmpq+20*%%str]
%%STORE_2X2 0, 1, 10, 11, 2, 3, 7
; store t6-7 and t24-25
- mova m0, [rsp+24*%%str]
- mova m1, [rsp+28*%%str]
+ mova m0, [tmpq+24*%%str]
+ mova m1, [tmpq+28*%%str]
%%STORE_2X2 0, 1, 8, 9, 2, 3, 7
; store t8-9 and t22-23
- mova m0, [rsp+ 2*%%str]
- mova m1, [rsp+ 6*%%str]
- mova m8, [rsp+29*%%str]
+ mova m0, [tmpq+ 2*%%str]
+ mova m1, [tmpq+ 6*%%str]
+ mova m8, [tmpq+29*%%str]
%%STORE_2X2 0, 1, 6, 8, 2, 3, 7
; store t10-11 and t20-21
- mova m0, [rsp+10*%%str]
- mova m1, [rsp+14*%%str]
+ mova m0, [tmpq+10*%%str]
+ mova m1, [tmpq+14*%%str]
%%STORE_2X2 0, 1, 4, 5, 2, 3, 7
; store t12-13 and t18-19
- mova m0, [rsp+18*%%str]
- mova m1, [rsp+22*%%str]
- mova m5, [rsp+13*%%str]
- mova m4, [rsp+ 9*%%str]
+ mova m0, [tmpq+18*%%str]
+ mova m1, [tmpq+22*%%str]
+ mova m5, [tmpq+13*%%str]
+ mova m4, [tmpq+ 9*%%str]
%%STORE_2X2 0, 1, 4, 5, 2, 3, 7
; store t14-17
- mova m0, [rsp+26*%%str]
- mova m1, [rsp+30*%%str]
- mova m5, [rsp+ 5*%%str]
- mova m4, [rsp+ 1*%%str]
+ mova m0, [tmpq+26*%%str]
+ mova m1, [tmpq+30*%%str]
+ mova m5, [tmpq+ 5*%%str]
+ mova m4, [tmpq+ 1*%%str]
%%STORE_2X2 0, 1, 4, 5, 2, 3, 7, 0
%endif
%endmacro
%macro VP9_IDCT_IDCT_32x32_ADD_XMM 1
INIT_XMM %1
-cglobal vp9_idct_idct_32x32_add, 4, 8, 16, 2048, dst, stride, block, eob
+cglobal vp9_idct_idct_32x32_add, 4, 9, 16, 2048, dst, stride, block, eob
cmp eobd, 135
jg .idctfull
cmp eobd, 34
VP9_STORE_2XFULL 0, 1, 2, 3, 4, 5, mmsize
RET
- DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2
+ DEFINE_ARGS dst_bak, stride, block, cnt, dst, stride30, dst_end, stride2, tmp
.idct8x8:
+ mov tmpq, rsp
VP9_IDCT32_1D blockq, 1, 8
mov stride30q, strideq ; stride
.loop2_8x8:
mov dstq, dst_bakq
lea dst_endq, [dst_bakq+stride30q]
- VP9_IDCT32_1D rsp, 2, 8
+ VP9_IDCT32_1D tmpq, 2, 8
add dst_bakq, 8
- add rsp, 16
+ add tmpq, 16
dec cntd
jg .loop2_8x8
- sub rsp, 64
; at the end of the loop, m7 should still be zero
; use that to zero out block coefficients
.idct16x16:
mov cntd, 2
+ mov tmpq, rsp
.loop1_16x16:
VP9_IDCT32_1D blockq, 1, 16
add blockq, 16
- add rsp, 512
+ add tmpq, 512
dec cntd
jg .loop1_16x16
sub blockq, 32
- sub rsp, 1024
mov stride30q, strideq ; stride
lea stride2q, [strideq*2] ; stride*2
shl stride30q, 5 ; stride*32
mov cntd, 4
+ mov tmpq, rsp
sub stride30q, stride2q ; stride*30
.loop2_16x16:
mov dstq, dst_bakq
lea dst_endq, [dst_bakq+stride30q]
- VP9_IDCT32_1D rsp, 2, 16
+ VP9_IDCT32_1D tmpq, 2, 16
add dst_bakq, 8
- add rsp, 16
+ add tmpq, 16
dec cntd
jg .loop2_16x16
- sub rsp, 64
; at the end of the loop, m7 should still be zero
; use that to zero out block coefficients
.idctfull:
mov cntd, 4
+ mov tmpq, rsp
.loop1_full:
VP9_IDCT32_1D blockq, 1
add blockq, 16
- add rsp, 512
+ add tmpq, 512
dec cntd
jg .loop1_full
sub blockq, 64
- sub rsp, 2048
mov stride30q, strideq ; stride
lea stride2q, [strideq*2] ; stride*2
shl stride30q, 5 ; stride*32
mov cntd, 4
+ mov tmpq, rsp
sub stride30q, stride2q ; stride*30
.loop2_full:
mov dstq, dst_bakq
lea dst_endq, [dst_bakq+stride30q]
- VP9_IDCT32_1D rsp, 2
+ VP9_IDCT32_1D tmpq, 2
add dst_bakq, 8
- add rsp, 16
+ add tmpq, 16
dec cntd
jg .loop2_full
- sub rsp, 64
; at the end of the loop, m7 should still be zero
; use that to zero out block coefficients