times 8 dw 25080
times 8 dw 12785
; NOTE(review): this text carries unified-diff markers; '+' lines are added
; and '-' lines are removed by the change.
; Byte constants defined locally by the change: each label names 8 bytes
; holding the same value (0x07, 0x1f and 0x81 respectively).
+pb_7: times 8 db 0x07
+pb_1F: times 8 db 0x1f
+pb_81: times 8 db 0x81
+
; Remaining constants are still brought in through cextern (presumably
; symbols defined in another object file -- confirm against x86inc).
; The cextern lines for pb_7, pb_1F and pb_81 are removed because those
; labels are now defined above.
cextern pb_1
cextern pb_3
-cextern pb_7
-cextern pb_1F
cextern pb_80
-cextern pb_81
cextern pw_8
; at this point, function has completed dequantization + dezigzag +
; partial transposition; now do the idct itself
-%define I(x) [%1+16* x ]
-%define J(x) [%1+16*(x-4)+8]
+%define I(x) [%1+16*x]
+%define J(x) [%1+16*x]
RowIDCT
Transpose
-%define I(x) [%1+16* x +64]
-%define J(x) [%1+16*(x-4)+72]
+%define I(x) [%1+16*x+8]
+%define J(x) [%1+16*x+8]
RowIDCT
Transpose
-%define I(x) [%1+16*x]
-%define J(x) [%1+16*x]
+%define I(x) [%1+16* x]
+%define J(x) [%1+16*(x-4)+8]
ColumnIDCT
-%define I(x) [%1+16*x+8]
-%define J(x) [%1+16*x+8]
+%define I(x) [%1+16* x +64]
+%define J(x) [%1+16*(x-4)+72]
ColumnIDCT
%endif ; mmsize == 16/8
%endmacro
mova m1, [r2+mmsize*2+%%i]
mova m2, [r2+mmsize*4+%%i]
mova m3, [r2+mmsize*6+%%i]
+%if mmsize == 8
+ packsswb m0, [r2+mmsize*8+%%i]
+ packsswb m1, [r2+mmsize*10+%%i]
+ packsswb m2, [r2+mmsize*12+%%i]
+ packsswb m3, [r2+mmsize*14+%%i]
+%else
packsswb m0, [r2+mmsize*1+%%i]
packsswb m1, [r2+mmsize*3+%%i]
packsswb m2, [r2+mmsize*5+%%i]
packsswb m3, [r2+mmsize*7+%%i]
+%endif
paddb m0, m4
paddb m1, m4
paddb m2, m4
movq [r0+r1*2], m3
movhps [r0+r3 ], m3
%endif
-%assign %%i %%i+64
+%assign %%i %%i+8
+%endrep
+
+ pxor m0, m0
+%assign %%offset 0
+%rep 128/mmsize
+ mova [r2+%%offset], m0
+%assign %%offset %%offset+mmsize
%endrep
RET
; vp3_idct_add: runs VP3_IDCT on the data at r2, adds the result to the
; 8 rows of 8 bytes addressed through r0 (row step r1), and finally
; overwrites the 128 bytes at r2 with zeros.
;
; Register use visible in this body:
;   r0 = pointer to the rows that are read, updated and written back
;   r1 = row step, sign-extended from r1d
;   r2 = base of the 16-bit values that are added (cleared before RET)
;   r3 = r1*3 in the '+' version (loop counter in the removed '-' version)
;   m4 = all zeros; used for byte->word unpacking and for the final clear
;
; NOTE(review): this text carries unified-diff markers. '-' lines belong to
; the previous looped implementation (2 rows per iteration, 4 iterations);
; '+' lines are the replacement, unrolled with %rep, 4 rows per iteration.
; NOTE(review): "3, 4, 9" is presumably the x86inc argument / GPR / vector
; register count triple -- confirm against x86inc.asm.
cglobal vp3_idct_add, 3, 4, 9
VP3_IDCT r2
- mov r3, 4
- pxor m4, m4
movsxdifnidn r1, r1d
-.loop:
+ lea r3, [r1*3]
+ pxor m4, m4
; ---- mmsize == 16 path: 8 words fit in one register, so each row needs
; ---- only a low-half unpack. Two iterations of four rows each.
+%if mmsize == 16
+%assign %%i 0
+%rep 2
; load 8 bytes from each of four consecutive rows
movq m0, [r0]
movq m1, [r0+r1]
-%if mmsize == 8
- mova m2, m0
- mova m3, m1
-%endif
+ movq m2, [r0+r1*2]
+ movq m3, [r0+r3]
; widen bytes to words by interleaving with the zero register
punpcklbw m0, m4
punpcklbw m1, m4
-%if mmsize == 8
- punpckhbw m2, m4
- punpckhbw m3, m4
-%endif
- paddsw m0, [r2+ 0]
- paddsw m1, [r2+16]
-%if mmsize == 8
- paddsw m2, [r2+ 8]
- paddsw m3, [r2+24]
- packuswb m0, m2
- packuswb m1, m3
-%else ; mmsize == 16
+ punpcklbw m2, m4
+ punpcklbw m3, m4
; add the 16-bit values at r2 (16 bytes per row) with signed saturation
+ paddsw m0, [r2+ 0+%%i]
+ paddsw m1, [r2+16+%%i]
+ paddsw m2, [r2+32+%%i]
+ paddsw m3, [r2+48+%%i]
; pack back to bytes with unsigned saturation; two rows share one register
packuswb m0, m1
+ packuswb m2, m3
; low 8 bytes -> first row of the pair, high 8 bytes -> second row
+ movq [r0 ], m0
+ movhps [r0+r1 ], m0
+ movq [r0+r1*2], m2
+ movhps [r0+r3 ], m2
; advance r0 by four rows after the first iteration only
+%if %%i == 0
+ lea r0, [r0+r1*4]
%endif
- movq [r0 ], m0
-%if mmsize == 8
- movq [r0+r1], m1
-%else ; mmsize == 16
- movhps [r0+r1], m0
; second iteration reads the values 64 bytes further into r2
+%assign %%i %%i+64
+%endrep
; ---- other register size path (8-byte registers): each 8-byte row is
; ---- split into a low and a high half of four words each.
+%else
+%assign %%i 0
+%rep 2
+ movq m0, [r0]
+ movq m1, [r0+r1]
+ movq m2, [r0+r1*2]
+ movq m3, [r0+r3]
; keep copies of rows 0..2 for the high halves (m5, m6, m7)
+ movq m5, m0
+ movq m6, m1
+ movq m7, m2
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+ punpcklbw m2, m4
+ punpckhbw m5, m4
+ punpckhbw m6, m4
+ punpckhbw m7, m4
; low halves add the values at r2+0/16/32, high halves those at r2+64/80/96
+ paddsw m0, [r2+ 0+%%i]
+ paddsw m1, [r2+16+%%i]
+ paddsw m2, [r2+32+%%i]
+ paddsw m5, [r2+64+%%i]
+ paddsw m6, [r2+80+%%i]
+ paddsw m7, [r2+96+%%i]
+ packuswb m0, m5
; m5 is free again once row 0 is packed; reuse it for row 3's high half
+ movq m5, m3
+ punpcklbw m3, m4
+ punpckhbw m5, m4
+ packuswb m1, m6
+ paddsw m3, [r2+48+%%i]
+ paddsw m5, [r2+112+%%i]
+ packuswb m2, m7
+ packuswb m3, m5
+ movq [r0 ], m0
+ movq [r0+r1 ], m1
+ movq [r0+r1*2], m2
+ movq [r0+r3 ], m3
; advance r0 by four rows after the first iteration only
+%if %%i == 0
+ lea r0, [r0+r1*4]
+%endif
; second iteration reads the values 8 bytes further into r2
+%assign %%i %%i+8
+%endrep
%endif
- lea r0, [r0+r1*2]
- add r2, 32
- dec r3
- jg .loop
; store the zero register over the 128 bytes at r2, mmsize bytes at a time
+%assign %%i 0
+%rep 128/mmsize
+ mova [r2+%%i], m4
+%assign %%i %%i+mmsize
+%endrep
RET
%endmacro
paddusb m2, m0
movq m4, [r0+r1*2]
paddusb m3, m0
- movq m5, [r0+r3 ]
+ movq m5, [r0+r2 ]
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
movq [r0+r1 ], m3
psubusb m5, m1
movq [r0+r1*2], m4
- movq [r0+r3 ], m5
+ movq [r0+r2 ], m5
%endmacro
INIT_MMX mmxext
%if ARCH_X86_64
movsxd r1, r1d
%endif
- lea r3, [r1*3]
- movsx r2, word [r2]
- add r2, 15
- sar r2, 5
- movd m0, r2d
+ movsx r3, word [r2]
+ mov word [r2], 0
+ lea r2, [r1*3]
+ add r3, 15
+ sar r3, 5
+ movd m0, r3d
pshufw m0, m0, 0x0
pxor m1, m1
psubw m1, m0