%if ARCH_X86_64
+%macro define_constants 1
+ ; %1 = suffix appended to each constant name.
+ ; Rebinds every generic constant name to its suffixed variant (name -> name%1).
+ ; Each name is handled on its own: drop any earlier binding, then bind anew.
+ %undef w4_plus_w2
+ %define w4_plus_w2 w4_plus_w2%1
+ %undef w4_min_w2
+ %define w4_min_w2 w4_min_w2%1
+ %undef w4_plus_w6
+ %define w4_plus_w6 w4_plus_w6%1
+ %undef w4_min_w6
+ %define w4_min_w6 w4_min_w6%1
+ %undef w1_plus_w3
+ %define w1_plus_w3 w1_plus_w3%1
+ %undef w3_min_w1
+ %define w3_min_w1 w3_min_w1%1
+ %undef w7_plus_w3
+ %define w7_plus_w3 w7_plus_w3%1
+ %undef w3_min_w7
+ %define w3_min_w7 w3_min_w7%1
+%endmacro
+
; interleave data while maintaining source
; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
%macro SBUTTERFLY3 5
; %2 = row bias macro
; %3 = column shift
; %4 = column bias macro
-; %5 = min pixel value
-; %6 = max pixel value
-; %7 = qmat (for prores)
+; %5 = final action: omit for nothing, else "store", "put" or "add" (quoted)
+; %6 = min pixel value (address of the constant, or literal 0)
+; %7 = max pixel value (address of the constant)
+; %8 = qmat (for prores); when given, the input rows are multiplied by it
-%macro IDCT_FN 4-7
+%macro IDCT_FN 4-8
; for (i = 0; i < 8; i++)
; idctRowCondDC(block + i*8);
mova m10,[blockq+ 0] ; { row[0] }[0-7]
mova m13,[blockq+64] ; { row[4] }[0-7]
mova m12,[blockq+96] ; { row[6] }[0-7]
-%if %0 == 7
- pmullw m10,[%7+ 0]
- pmullw m8, [%7+32]
- pmullw m13,[%7+64]
- pmullw m12,[%7+96]
+%if %0 == 8
+ pmullw m10,[%8+ 0] ; qmat given: row[0] *= qmat at the same byte offset (low 16 bits kept)
+ pmullw m8, [%8+32] ; same for m8 (presumably row[2]; its load is not shown here)
+ pmullw m13,[%8+64] ; row[4] *= qmat at offset 64
+ pmullw m12,[%8+96] ; row[6] *= qmat at offset 96
+
+ IDCT_1D %1, %2, %8
+%elif %2 == 11
+ ; DC-only shortcut, copied from the C code. When there is only a DC
+ ; coefficient, the C shifts the value and splats it to all coeffs instead of
+ ; multiplying and doing the full IDCT. The results differ on 8-bit because the
+ ; coefficient is 16383 rather than 16384 (which a plain shift would give).
+ por m1, m8, m13 ; m1 accumulates the OR of every input except row[0]
+ por m1, m12
+ por m1, [blockq+ 16] ; { row[1] }[0-7]
+ por m1, [blockq+ 48] ; { row[3] }[0-7]
+ por m1, [blockq+ 80] ; { row[5] }[0-7]
+ por m1, [blockq+112] ; { row[7] }[0-7]
+ pxor m2, m2 ; m2 = 0
+ pcmpeqw m1, m2 ; m1 = all-ones in each word lane whose OR-ed inputs are zero
+ psllw m2, m10, 3 ; m2 = row[0] << 3, the shifted DC value
+ pand m2, m1 ; keep the shifted DC only in the DC-only lanes
+ pcmpeqb m3, m3 ; m3 = all ones
+ pxor m1, m3 ; invert the mask: m1 now selects the lanes that are not DC-only
+ mova [rsp], m1 ; save the inverted mask; it is reloaded after IDCT_1D
+ mova [rsp+16], m2 ; save the masked DC value; it is reloaded after IDCT_1D
+
+ IDCT_1D %1, %2
- IDCT_1D %1, %2, %7
+ mova m5, [rsp] ; m5 = inverted mask saved above
+ mova m6, [rsp+16] ; m6 = masked shifted-DC value saved above
+ pand m8, m5 ; blend: keep the register's value in the non-DC-only lanes...
+ por m8, m6 ; ...and insert the shifted DC value in the DC-only lanes
+ pand m0, m5 ; the same blend is repeated for the remaining seven registers
+ por m0, m6 ; (presumably the IDCT_1D outputs; verify against IDCT_1D)
+ pand m1, m5
+ por m1, m6
+ pand m2, m5
+ por m2, m6
+ pand m4, m5
+ por m4, m6
+ pand m11, m5
+ por m11, m6
+ pand m9, m5
+ por m9, m6
+ pand m10, m5
+ por m10, m6
%else
IDCT_1D %1, %2
%endif
IDCT_1D %3, %4
; clip/store
-%if %0 == 4
+%if %0 >= 5
+%ifidn %5,"store"
; No clamping, means pure idct
mova [blockq+ 0], m8
mova [blockq+ 16], m0
mova [blockq+ 80], m11
mova [blockq+ 96], m9
mova [blockq+112], m10
-%else
-%ifidn %5, 0
+%elifidn %5,"put"
+%ifidn %6, 0
pxor m3, m3
%else
- mova m3, [%5]
-%endif
- mova m5, [%6]
+ mova m3, [%6] ; m3 = min pixel value, used as the pmaxsw lower bound below
+%endif ; ifidn %6, 0
+ mova m5, [%7] ; m5 = max pixel value loaded from memory
pmaxsw m8, m3
pmaxsw m0, m3
pmaxsw m1, m3
mova [r0+r1 ], m11
mova [r0+r1*2], m9
mova [r0+r2 ], m10
-%endif
+%endif ; %5 action
+%endif; if %0 >= 5
%endmacro
%endif