psubw %2, %1
%endmacro
-%macro SUMSUB_BADC 4
- paddw %1, %2
- paddw %3, %4
- paddw %2, %2
- paddw %4, %4
- psubw %2, %1
- psubw %4, %3
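+; interleave the elements of m%2 and m%3 at granularity %1 (wd/dq/qdq);
+; low half ends up in m%2, high half in m%3 (m%4 is scratch, SWAP fixes the naming)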
+%macro SBUTTERFLY 4
+ mova m%4, m%2
+ punpckl%1 m%2, m%3
+ punpckh%1 m%4, m%3
+ SWAP %3, %4
%endmacro
-%macro SBUTTERFLY 5
- mov%1 %5, %3
- punpckl%2 %3, %4
- punpckh%2 %5, %4
-%endmacro
-
-; input ABCD output ADTC
%macro TRANSPOSE4x4W 5
- SBUTTERFLY q, wd, %1, %2, %5
- SBUTTERFLY q, wd, %3, %4, %2
- SBUTTERFLY q, dq, %1, %3, %4
- SBUTTERFLY q, dq, %5, %2, %3
+ SBUTTERFLY wd, %1, %2, %5
+ SBUTTERFLY wd, %3, %4, %5
+ SBUTTERFLY dq, %1, %3, %5
+ SBUTTERFLY dq, %2, %4, %5
+ SWAP %2, %3
%endmacro
-; input 2x8 unsigned bytes (%5,%6), zero (%7) output: difference (%1,%2)
-%macro LOAD_DIFF_8P 7
- movq %1, %5
- movq %2, %1
- punpcklbw %1, %7
- punpckhbw %2, %7
- movq %3, %6
- movq %4, %3
- punpcklbw %3, %7
- punpckhbw %4, %7
- psubw %1, %3
- psubw %2, %4
-%endmacro
-
-%macro LOADSUMSUB 4 ; returns %1=%3+%4, %2=%3-%4
- movq %2, %3
- movq %1, %4
- SUMSUB_BA %1, %2
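+; in: %3, %4 = one movh of unsigned pixels each
+; out: %1 = %3 - %4 as signed words; %2 clobbered
+; trick: interleaving %1 with %2's bytes and %2 with itself makes the high bytes cancel in psubw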
+%macro LOAD_DIFF_8P 4
+ movh %1, %3
+ movh %2, %4
+ punpcklbw %1, %2
+ punpcklbw %2, %2
+ psubw %1, %2
%endmacro
%macro STORE_DIFF_8P 4
psraw %1, 6
- movq %3, %2
+ movh %3, %2
punpcklbw %3, %4
paddsw %1, %3
packuswb %1, %1
- movq %2, %1
+ movh %2, %1
%endmacro
+; in: m0..m7
+; out: 0,4,6 in mem, rest in regs
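+; %9 = coefficient buffer (16 bytes per row); dst0/dst4/dst6 are written to rows 0/4/6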
+%macro DCT8_1D 9
+ SUMSUB_BA m%8, m%1 ; %8 = s07, %1 = d07
+ SUMSUB_BA m%7, m%2 ; %7 = s16, %2 = d16
+ SUMSUB_BA m%6, m%3 ; %6 = s25, %3 = d25
+ SUMSUB_BA m%5, m%4 ; %5 = s34, %4 = d34
+ SUMSUB_BA m%5, m%8 ; %5 = a0, %8 = a2
+ SUMSUB_BA m%6, m%7 ; %6 = a1, %7 = a3
+ SUMSUB_BA m%6, m%5 ; %6 = dst0, %5 = dst4
+ mova [%9+0x00], m%6
+ mova [%9+0x40], m%5
+ mova m%6, m%7 ; a3
+ psraw m%6, 1 ; a3>>1
+ paddw m%6, m%8 ; a2 + (a3>>1)
+ psraw m%8, 1 ; a2>>1
+ psubw m%8, m%7 ; (a2>>1) - a3
+ mova [%9+0x60], m%8
+ mova m%5, m%3
+ psraw m%5, 1
+ paddw m%5, m%3 ; d25+(d25>>1)
+ mova m%7, m%1
+ psubw m%7, m%4
+ psubw m%7, m%5 ; a5 = d07-d34-(d25+(d25>>1))
+ mova m%5, m%2
+ psraw m%5, 1
+ paddw m%5, m%2 ; d16+(d16>>1)
+ mova m%8, m%1
+ paddw m%8, m%4
+ psubw m%8, m%5 ; a6 = d07+d34-(d16+(d16>>1))
+ mova m%5, m%1
+ psraw m%5, 1
+ paddw m%5, m%1 ; d07+(d07>>1)
+ paddw m%5, m%2
+ paddw m%5, m%3 ; a4 = d16+d25+(d07+(d07>>1))
+ mova m%1, m%4
+ psraw m%1, 1
+ paddw m%1, m%4 ; d34+(d34>>1)
+ paddw m%1, m%2
+ psubw m%1, m%3 ; a7 = d16-d25+(d34+(d34>>1))
+ mova m%4, m%1
+ psraw m%4, 2
+ paddw m%4, m%5 ; a4 + (a7>>2)
+ mova m%3, m%8
+ psraw m%3, 2
+ paddw m%3, m%7 ; a5 + (a6>>2)
+ psraw m%5, 2
+ psraw m%7, 2
+ psubw m%5, m%1 ; (a4>>2) - a7
+ psubw m%8, m%7 ; a6 - (a5>>2)
+ SWAP %2, %4, %3, %6, %8, %5
+%endmacro
-;-----------------------------------------------------------------------------
-; void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
-;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_sub_8x8_mmx:
- pxor mm7, mm7
- %assign i 0
- %rep 8
- LOAD_DIFF_8P mm0, mm1, mm2, mm3, [r1], [r2], mm7
- movq [r0+i], mm0
- movq [r0+i+8], mm1
- add r1, FENC_STRIDE
- add r2, FDEC_STRIDE
- %assign i i+16
- %endrep
- ret
+; in: 0,4 in mem, rest in regs
+; out: m0..m7
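+; %9 = coefficient buffer; rows 0 and 4 are read from [%9+0x00] and [%9+0x40]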
+%macro IDCT8_1D 9
+ mova m%1, m%3
+ mova m%5, m%7
+ psraw m%3, 1
+ psraw m%7, 1
+ psubw m%3, m%5
+ paddw m%7, m%1
+ mova m%5, m%2
+ psraw m%5, 1
+ paddw m%5, m%2
+ paddw m%5, m%4
+ paddw m%5, m%6
+ mova m%1, m%6
+ psraw m%1, 1
+ paddw m%1, m%6
+ paddw m%1, m%8
+ psubw m%1, m%2
+ psubw m%2, m%4
+ psubw m%6, m%4
+ paddw m%2, m%8
+ psubw m%6, m%8
+ psraw m%4, 1
+ psraw m%8, 1
+ psubw m%2, m%4
+ psubw m%6, m%8
+ mova m%4, m%5
+ mova m%8, m%1
+ psraw m%4, 2
+ psraw m%8, 2
+ paddw m%4, m%6
+ paddw m%8, m%2
+ psraw m%6, 2
+ psraw m%2, 2
+ psubw m%5, m%6
+ psubw m%2, m%1
+ mova m%1, [%9+0x00]
+ mova m%6, [%9+0x40]
+ SUMSUB_BA m%6, m%1
+ SUMSUB_BA m%7, m%6
+ SUMSUB_BA m%3, m%1
+ SUMSUB_BA m%5, m%7
+ SUMSUB_BA m%2, m%3
+ SUMSUB_BA m%8, m%1
+ SUMSUB_BA m%4, m%6
+ SWAP %1, %5, %6
+ SWAP %3, %8, %7
+%endmacro
-;-----------------------------------------------------------------------------
-; void x264_ydct8_mmx( int16_t dest[8][8] );
-;-----------------------------------------------------------------------------
+INIT_MMX
ALIGN 16
-x264_ydct8_mmx:
- ;-------------------------------------------------------------------------
- ; vertical dct ( compute 4 columns at a time -> 2 loops )
- ;-------------------------------------------------------------------------
- %assign i 0
- %rep 2
-
- LOADSUMSUB mm2, mm3, [r0+i+0*16], [r0+i+7*16] ; mm2 = s07, mm3 = d07
- LOADSUMSUB mm1, mm5, [r0+i+1*16], [r0+i+6*16] ; mm1 = s16, mm5 = d16
- LOADSUMSUB mm0, mm6, [r0+i+2*16], [r0+i+5*16] ; mm0 = s25, mm6 = d25
- LOADSUMSUB mm4, mm7, [r0+i+3*16], [r0+i+4*16] ; mm4 = s34, mm7 = d34
-
- SUMSUB_BA mm4, mm2 ; mm4 = a0, mm2 = a2
- SUMSUB_BA mm0, mm1 ; mm0 = a1, mm1 = a3
- SUMSUB_BA mm0, mm4 ; mm0 = dst0, mm1 = dst4
-
- movq [r0+i+0*16], mm0
- movq [r0+i+4*16], mm4
-
- movq mm0, mm1 ; a3
- psraw mm0, 1 ; a3>>1
- paddw mm0, mm2 ; a2 + (a3>>1)
- psraw mm2, 1 ; a2>>1
- psubw mm2, mm1 ; (a2>>1) - a3
-
- movq [r0+i+2*16], mm0
- movq [r0+i+6*16], mm2
-
- movq mm0, mm6
- psraw mm0, 1
- paddw mm0, mm6 ; d25+(d25>>1)
- movq mm1, mm3
- psubw mm1, mm7 ; a5 = d07-d34-(d25+(d25>>1))
- psubw mm1, mm0
-
- movq mm0, mm5
- psraw mm0, 1
- paddw mm0, mm5 ; d16+(d16>>1)
- movq mm2, mm3
- paddw mm2, mm7 ; a6 = d07+d34-(d16+(d16>>1))
- psubw mm2, mm0
-
- movq mm0, mm3
- psraw mm0, 1
- paddw mm0, mm3 ; d07+(d07>>1)
- paddw mm0, mm5
- paddw mm0, mm6 ; a4 = d16+d25+(d07+(d07>>1))
-
- movq mm3, mm7
- psraw mm3, 1
- paddw mm3, mm7 ; d34+(d34>>1)
- paddw mm3, mm5
- psubw mm3, mm6 ; a7 = d16-d25+(d34+(d34>>1))
-
- movq mm7, mm3
- psraw mm7, 2
- paddw mm7, mm0 ; a4 + (a7>>2)
-
- movq mm6, mm2
- psraw mm6, 2
- paddw mm6, mm1 ; a5 + (a6>>2)
-
- psraw mm0, 2
- psraw mm1, 2
- psubw mm0, mm3 ; (a4>>2) - a7
- psubw mm2, mm1 ; a6 - (a5>>2)
-
- movq [r0+i+1*16], mm7
- movq [r0+i+3*16], mm6
- movq [r0+i+5*16], mm2
- movq [r0+i+7*16], mm0
-
- %assign i i+8
- %endrep
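+; load 8 rows of pix1-pix2 differences (4 pixels wide with mmx) into m0..m7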
+load_diff_4x8_mmx:
+ LOAD_DIFF_8P m0, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF_8P m1, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+ LOAD_DIFF_8P m2, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+ LOAD_DIFF_8P m3, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+ LOAD_DIFF_8P m4, m7, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+ LOAD_DIFF_8P m5, m7, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
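+ ; park row 0 in memory: m0 doubles as the unpack temp for the last LOAD_DIFF_8P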
+ movq [r0], m0
+ LOAD_DIFF_8P m6, m7, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+ LOAD_DIFF_8P m7, m0, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+ movq m0, [r0]
ret
-;-----------------------------------------------------------------------------
-; void x264_yidct8_mmx( int16_t dest[8][8] );
-;-----------------------------------------------------------------------------
+INIT_MMX
ALIGN 16
-x264_yidct8_mmx:
- ;-------------------------------------------------------------------------
- ; vertical idct ( compute 4 columns at a time -> 2 loops )
- ;-------------------------------------------------------------------------
- %assign i 0
- %rep 2
-
- movq mm1, [r0+i+1*16] ; mm1 = d1
- movq mm3, [r0+i+3*16] ; mm3 = d3
- movq mm5, [r0+i+5*16] ; mm5 = d5
- movq mm7, [r0+i+7*16] ; mm7 = d7
-
- movq mm4, mm7
- psraw mm4, 1
- movq mm0, mm5
- psubw mm0, mm7
- psubw mm0, mm4
- psubw mm0, mm3 ; mm0 = e1
-
- movq mm6, mm3
- psraw mm6, 1
- movq mm2, mm7
- psubw mm2, mm6
- psubw mm2, mm3
- paddw mm2, mm1 ; mm2 = e3
-
- movq mm4, mm5
- psraw mm4, 1
- paddw mm4, mm5
- paddw mm4, mm7
- psubw mm4, mm1 ; mm4 = e5
-
- movq mm6, mm1
- psraw mm6, 1
- paddw mm6, mm1
- paddw mm6, mm5
- paddw mm6, mm3 ; mm6 = e7
-
- movq mm1, mm0
- movq mm3, mm4
- movq mm5, mm2
- movq mm7, mm6
- psraw mm6, 2
- psraw mm3, 2
- psraw mm5, 2
- psraw mm0, 2
- paddw mm1, mm6 ; mm1 = f1
- paddw mm3, mm2 ; mm3 = f3
- psubw mm5, mm4 ; mm5 = f5
- psubw mm7, mm0 ; mm7 = f7
-
- movq mm2, [r0+i+2*16] ; mm2 = d2
- movq mm6, [r0+i+6*16] ; mm6 = d6
- movq mm4, mm2
- movq mm0, mm6
- psraw mm4, 1
- psraw mm6, 1
- psubw mm4, mm0 ; mm4 = a4
- paddw mm6, mm2 ; mm6 = a6
-
- movq mm2, [r0+i+0*16] ; mm2 = d0
- movq mm0, [r0+i+4*16] ; mm0 = d4
- SUMSUB_BA mm0, mm2 ; mm0 = a0, mm2 = a2
-
- SUMSUB_BADC mm6, mm0, mm4, mm2 ; mm6 = f0, mm0 = f6
- ; mm4 = f2, mm2 = f4
-
- SUMSUB_BADC mm7, mm6, mm5, mm4 ; mm7 = g0, mm6 = g7
- ; mm5 = g1, mm4 = g6
- SUMSUB_BADC mm3, mm2, mm1, mm0 ; mm3 = g2, mm2 = g5
- ; mm1 = g3, mm0 = g4
-
- movq [r0+i+0*16], mm7
- movq [r0+i+1*16], mm5
- movq [r0+i+2*16], mm3
- movq [r0+i+3*16], mm1
- movq [r0+i+4*16], mm0
- movq [r0+i+5*16], mm2
- movq [r0+i+6*16], mm4
- movq [r0+i+7*16], mm6
-
- %assign i i+8
- %endrep
+dct8_mmx:
+ DCT8_1D 0,1,2,3,4,5,6,7,r0
+ SAVE_MM_PERMUTATION dct8_mmx
ret
-;-----------------------------------------------------------------------------
-; void x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] );
-;-----------------------------------------------------------------------------
-ALIGN 16
-x264_pixel_add_8x8_mmx:
- pxor mm7, mm7
- %assign i 0
- %rep 8
- movq mm0, [r0]
- movq mm2, [r1+i]
- movq mm3, [r1+i+8]
- movq mm1, mm0
- psraw mm2, 6
- psraw mm3, 6
- punpcklbw mm0, mm7
- punpckhbw mm1, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- packuswb mm0, mm1
- movq [r0], mm0
- add r0, FDEC_STRIDE
- %assign i i+16
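+; spill/unspill offsets are in units of 16 bytes (one dct row)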
+%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
+ %xdefine %%base %1
+ %rep %0/2
+ %xdefine %%tmp m%2
+ %rotate %0/2
+ mova [%%base + %2*16], %%tmp
+ %rotate 1-%0/2
%endrep
- ret
-
-;-----------------------------------------------------------------------------
-; void x264_transpose_8x8_mmx( int16_t src[8][8] );
-;-----------------------------------------------------------------------------
-ALIGN 16
-x264_transpose_8x8_mmx:
- movq mm0, [r0 ]
- movq mm1, [r0+ 16]
- movq mm2, [r0+ 32]
- movq mm3, [r0+ 48]
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
- movq [r0 ], mm0
- movq [r0+ 16], mm3
- movq [r0+ 32], mm4
- movq [r0+ 48], mm2
+%endmacro
- movq mm0, [r0+ 72]
- movq mm1, [r0+ 88]
- movq mm2, [r0+104]
- movq mm3, [r0+120]
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
- movq [r0+ 72], mm0
- movq [r0+ 88], mm3
- movq [r0+104], mm4
- movq [r0+120], mm2
+%macro UNSPILL_SHUFFLE 3-*
+ %xdefine %%base %1
+ %rep %0/2
+ %xdefine %%tmp m%2
+ %rotate %0/2
+ mova %%tmp, [%%base + %2*16]
+ %rotate 1-%0/2
+ %endrep
+%endmacro
- movq mm0, [r0+ 8]
- movq mm1, [r0+ 24]
- movq mm2, [r0+ 40]
- movq mm3, [r0+ 56]
- TRANSPOSE4x4W mm0, mm1, mm2, mm3, mm4
- movq mm1, [r0+ 64]
- movq mm5, [r0+ 80]
- movq mm6, [r0+ 96]
- movq mm7, [r0+112]
+%macro SPILL 2+ ; assume offsets are the same as reg numbers
+ SPILL_SHUFFLE %1, %2, %2
+%endmacro
- movq [r0+ 64], mm0
- movq [r0+ 80], mm3
- movq [r0+ 96], mm4
- movq [r0+112], mm2
- TRANSPOSE4x4W mm1, mm5, mm6, mm7, mm4
- movq [r0+ 8], mm1
- movq [r0+ 24], mm7
- movq [r0+ 40], mm4
- movq [r0+ 56], mm6
- ret
+%macro UNSPILL 2+
+ UNSPILL_SHUFFLE %1, %2, %2
+%endmacro
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_mmx, 3,3
- call x264_pixel_sub_8x8_mmx
- call x264_ydct8_mmx
- call x264_transpose_8x8_mmx
- jmp x264_ydct8_mmx
-
-;-----------------------------------------------------------------------------
-; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
-;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_mmx, 0,1
- mov r0, r1m
- add word [r0], 32
- call x264_yidct8_mmx
- call x264_transpose_8x8_mmx
- call x264_yidct8_mmx
- mov r1, r0
- mov r0, r0m
- jmp x264_pixel_add_8x8_mmx
+global x264_sub8x8_dct8_mmx %+ .skip_prologue
+.skip_prologue:
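+ ; vertical 1d dct on the left and right 4-column halves, 4x4 transposes
+ ; through the dct buffer, then two more 1d passes for the horizontal direction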
+ INIT_MMX
+ call load_diff_4x8_mmx
+ call dct8_mmx
+ UNSPILL r0, 0
+ TRANSPOSE4x4W 0,1,2,3,4
+ SPILL r0, 0,1,2,3
+ UNSPILL r0, 4,6
+ TRANSPOSE4x4W 4,5,6,7,0
+ SPILL r0, 4,5,6,7
+ INIT_MMX
+ add r1, 4
+ add r2, 4
+ add r0, 8
+ call load_diff_4x8_mmx
+ sub r1, 4
+ sub r2, 4
+ call dct8_mmx
+ sub r0, 8
+ UNSPILL r0+8, 4,6
+ TRANSPOSE4x4W 4,5,6,7,0
+ SPILL r0+8, 4,5,6,7
+ UNSPILL r0+8, 0
+ TRANSPOSE4x4W 0,1,2,3,5
+ UNSPILL r0, 4,5,6,7
+ SPILL_SHUFFLE r0, 0,1,2,3, 4,5,6,7
+ movq mm4, m6 ; depends on the permutation to not produce conflicts
+ movq mm0, m4
+ movq mm1, m5
+ movq mm2, mm4
+ movq mm3, m7
+ INIT_MMX
+ UNSPILL r0+8, 4,5,6,7
+ add r0, 8
+ call dct8_mmx
+ sub r0, 8
+ SPILL r0+8, 1,2,3,5,7
+ INIT_MMX
+ UNSPILL r0, 0,1,2,3,4,5,6,7
+ call dct8_mmx
+ SPILL r0, 1,2,3,5,7
+ ret
-%macro IDCT8_1D 8
- movdqa %1, %3
- movdqa %5, %7
- psraw %3, 1
- psraw %7, 1
- psubw %3, %5
- paddw %7, %1
- movdqa %5, %2
- psraw %5, 1
- paddw %5, %2
- paddw %5, %4
- paddw %5, %6
- movdqa %1, %6
- psraw %1, 1
- paddw %1, %6
- paddw %1, %8
- psubw %1, %2
- psubw %2, %4
- psubw %6, %4
- paddw %2, %8
- psubw %6, %8
- psraw %4, 1
- psraw %8, 1
- psubw %2, %4
- psubw %6, %8
- movdqa %4, %5
- movdqa %8, %1
- psraw %4, 2
- psraw %8, 2
- paddw %4, %6
- paddw %8, %2
- psraw %6, 2
- psraw %2, 2
- psubw %5, %6
- psubw %2, %1
- movdqa %1, [eax+0x00]
- movdqa %6, [eax+0x40]
- SUMSUB_BA %6, %1
- SUMSUB_BA %7, %6
- SUMSUB_BA %3, %1
- SUMSUB_BA %5, %7
- SUMSUB_BA %2, %3
- SUMSUB_BA %8, %1
- SUMSUB_BA %4, %6
-%endmacro
+INIT_MMX
+ALIGN 16
+idct8_mmx:
+ IDCT8_1D 0,1,2,3,4,5,6,7,r1
+ SAVE_MM_PERMUTATION idct8_mmx
+ ret
-%macro TRANSPOSE8 9
- movdqa [%9], %8
- SBUTTERFLY dqa, wd, %1, %2, %8
- movdqa [%9+16], %8
- movdqa %8, [%9]
- SBUTTERFLY dqa, wd, %3, %4, %2
- SBUTTERFLY dqa, wd, %5, %6, %4
- SBUTTERFLY dqa, wd, %7, %8, %6
- SBUTTERFLY dqa, dq, %1, %3, %8
- movdqa [%9], %8
- movdqa %8, [16+%9]
- SBUTTERFLY dqa, dq, %8, %2, %3
- SBUTTERFLY dqa, dq, %5, %7, %2
- SBUTTERFLY dqa, dq, %4, %6, %7
- SBUTTERFLY dqa, qdq, %1, %5, %6
- SBUTTERFLY dqa, qdq, %8, %4, %5
- movdqa [%9+16], %8
- movdqa %8, [%9]
- SBUTTERFLY dqa, qdq, %8, %2, %4
- SBUTTERFLY dqa, qdq, %3, %7, %2
- movdqa %7, [%9+16]
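+; add one row of residual to dst: %1 = row, %2/%3 = low/high 4 words; requires m0 == 0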
+%macro ADD_STORE_ROW 3
+ movq m1, [r0+%1*FDEC_STRIDE]
+ movq m2, m1
+ punpcklbw m1, m0
+ punpckhbw m2, m0
+ paddw m1, %2
+ paddw m2, %3
+ packuswb m1, m2
+ movq [r0+%1*FDEC_STRIDE], m1
%endmacro
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
+; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2
- mov ecx, [esp+4]
- mov eax, [esp+8]
- movdqa xmm1, [eax+0x10]
- movdqa xmm2, [eax+0x20]
- movdqa xmm3, [eax+0x30]
- movdqa xmm5, [eax+0x50]
- movdqa xmm6, [eax+0x60]
- movdqa xmm7, [eax+0x70]
- IDCT8_1D xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
- TRANSPOSE8 xmm4, xmm1, xmm7, xmm3, xmm5, xmm0, xmm2, xmm6, eax
- picgetgot edx
- paddw xmm4, [pw_32 GLOBAL]
- movdqa [eax+0x00], xmm4
- movdqa [eax+0x40], xmm2
- IDCT8_1D xmm4, xmm0, xmm6, xmm3, xmm2, xmm5, xmm7, xmm1
- movdqa [eax+0x60], xmm6
- movdqa [eax+0x70], xmm7
- pxor xmm7, xmm7
- STORE_DIFF_8P xmm2, [ecx+FDEC_STRIDE*0], xmm6, xmm7
- STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*1], xmm6, xmm7
- STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*2], xmm6, xmm7
- STORE_DIFF_8P xmm3, [ecx+FDEC_STRIDE*3], xmm6, xmm7
- STORE_DIFF_8P xmm5, [ecx+FDEC_STRIDE*4], xmm6, xmm7
- STORE_DIFF_8P xmm4, [ecx+FDEC_STRIDE*5], xmm6, xmm7
- movdqa xmm0, [eax+0x60]
- movdqa xmm1, [eax+0x70]
- STORE_DIFF_8P xmm0, [ecx+FDEC_STRIDE*6], xmm6, xmm7
- STORE_DIFF_8P xmm1, [ecx+FDEC_STRIDE*7], xmm6, xmm7
+cglobal x264_add8x8_idct8_mmx, 2,2
+global x264_add8x8_idct8_mmx %+ .skip_prologue
+.skip_prologue:
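+ ; two 1d idct passes per 4-column half with 4x4 transposes in between,
+ ; then descale and add the residual to dst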
+ INIT_MMX
+ add word [r1], 32
+ UNSPILL r1, 1,2,3,5,6,7
+ call idct8_mmx
+ SPILL r1, 7
+ TRANSPOSE4x4W 0,1,2,3,7
+ SPILL r1, 0,1,2,3
+ UNSPILL r1, 7
+ TRANSPOSE4x4W 4,5,6,7,0
+ SPILL r1, 4,5,6,7
+ INIT_MMX
+ UNSPILL r1+8, 1,2,3,5,6,7
+ add r1, 8
+ call idct8_mmx
+ sub r1, 8
+ SPILL r1+8, 7
+ TRANSPOSE4x4W 0,1,2,3,7
+ SPILL r1+8, 0,1,2,3
+ UNSPILL r1+8, 7
+ TRANSPOSE4x4W 4,5,6,7,0
+ SPILL r1+8, 4,5,6,7
+ INIT_MMX
+ movq m3, [r1+0x08]
+ movq m0, [r1+0x40]
+ movq [r1+0x40], m3
+ movq [r1+0x08], m0
+ ; memory layout at this time:
+ ; A0------ A1------
+ ; B0------ F0------
+ ; C0------ G0------
+ ; D0------ H0------
+ ; E0------ E1------
+ ; B1------ F1------
+ ; C1------ G1------
+ ; D1------ H1------
+ UNSPILL_SHUFFLE r1, 1,2,3, 5,6,7
+ UNSPILL r1+8, 5,6,7
+ add r1, 8
+ call idct8_mmx
+ sub r1, 8
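+ ; >>6 descale; the rounding bias was folded in by the "add word [r1], 32" above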
+ psraw m0, 6
+ psraw m1, 6
+ psraw m2, 6
+ psraw m3, 6
+ psraw m4, 6
+ psraw m5, 6
+ psraw m6, 6
+ psraw m7, 6
+ movq [r1+0x08], m0 ; mm4
+ movq [r1+0x48], m4 ; mm5
+ movq [r1+0x58], m5 ; mm0
+ movq [r1+0x68], m6 ; mm2
+ movq [r1+0x78], m7 ; mm6
+ movq mm5, [r1+0x18]
+ movq mm6, [r1+0x28]
+ movq [r1+0x18], m1 ; mm1
+ movq [r1+0x28], m2 ; mm7
+ movq mm7, [r1+0x38]
+ movq [r1+0x38], m3 ; mm3
+ movq mm1, [r1+0x10]
+ movq mm2, [r1+0x20]
+ movq mm3, [r1+0x30]
+ call idct8_mmx
+ psraw m0, 6
+ psraw m1, 6
+ psraw m2, 6
+ psraw m3, 6
+ psraw m4, 6
+ psraw m5, 6
+ psraw m6, 6
+ psraw m7, 6
+ SPILL r1, 0,1,2
+ pxor m0, m0
+ ADD_STORE_ROW 0, [r1+0x00], [r1+0x08]
+ ADD_STORE_ROW 1, [r1+0x10], [r1+0x18]
+ ADD_STORE_ROW 2, [r1+0x20], [r1+0x28]
+ ADD_STORE_ROW 3, m3, [r1+0x38]
+ ADD_STORE_ROW 4, m4, [r1+0x48]
+ ADD_STORE_ROW 5, m5, [r1+0x58]
+ ADD_STORE_ROW 6, m6, [r1+0x68]
+ ADD_STORE_ROW 7, m7, [r1+0x78]
ret
-;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
-;-----------------------------------------------------------------------------
-%macro SUB_NxN_DCT 4
-cglobal %1
- mov edx, [esp+12]
- mov ecx, [esp+ 8]
- mov eax, [esp+ 4]
- add edx, %4
- add ecx, %4
- add eax, %3
- push edx
- push ecx
- push eax
- call %2
- add dword [esp+0], %3
- add dword [esp+4], %4*FENC_STRIDE-%4
- add dword [esp+8], %4*FDEC_STRIDE-%4
- call %2
- add dword [esp+0], %3
- add dword [esp+4], %4
- add dword [esp+8], %4
- call %2
- add esp, 12
- jmp %2
+
+
+INIT_XMM
+
+; in: m0..m7, except m6 which is in [%9+0x60]
+; out: m0..m7, except m4 which is in [%9+0x40]
+%macro TRANSPOSE8x8W 9
+ SBUTTERFLY wd, %1, %2, %7
+ movdqa [%9+16], m%2
+ movdqa m%7, [%9+0x60]
+ SBUTTERFLY wd, %3, %4, %2
+ SBUTTERFLY wd, %5, %6, %2
+ SBUTTERFLY wd, %7, %8, %2
+ SBUTTERFLY dq, %1, %3, %2
+ movdqa [%9], m%3
+ movdqa m%2, [%9+16]
+ SBUTTERFLY dq, %2, %4, %3
+ SBUTTERFLY dq, %5, %7, %3
+ SBUTTERFLY dq, %6, %8, %3
+ SBUTTERFLY qdq, %1, %5, %3
+ SBUTTERFLY qdq, %2, %6, %3
+ movdqa [%9+0x40], m%2
+ movdqa m%3, [%9]
+ SBUTTERFLY qdq, %3, %7, %2
+ SBUTTERFLY qdq, %4, %8, %2
+ SWAP %2, %5
+ SWAP %4, %7
%endmacro
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
+; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-%macro ADD_NxN_IDCT 4
-cglobal %1
- mov ecx, [esp+8]
- mov eax, [esp+4]
- add ecx, %3
- add eax, %4
- push ecx
- push eax
- call %2
- add dword [esp+0], %4*FDEC_STRIDE-%4
- add dword [esp+4], %3
- call %2
- add dword [esp+0], %4
- add dword [esp+4], %3
- call %2
- add esp, 8
- jmp %2
-%endmacro
-
-SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx, 128, 8
-ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx, 128, 8
-
-ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8
+cglobal x264_sub8x8_dct8_sse2, 3,3
+global x264_sub8x8_dct8_sse2 %+ .skip_prologue
+.skip_prologue:
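+ ; sse2: whole 8-pixel rows fit in xmm, so one vertical pass, one 8x8 transpose, one horizontal pass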
+ LOAD_DIFF_8P m0, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF_8P m1, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+ LOAD_DIFF_8P m2, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+ LOAD_DIFF_8P m3, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+ LOAD_DIFF_8P m4, m7, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+ LOAD_DIFF_8P m5, m7, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
+ SPILL r0, 0
+ LOAD_DIFF_8P m6, m7, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+ LOAD_DIFF_8P m7, m0, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
+ UNSPILL r0, 0
+ DCT8_1D 0,1,2,3,4,5,6,7,r0
+ UNSPILL r0, 0,4
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,r0
+ UNSPILL r0, 4
+ DCT8_1D 0,1,2,3,4,5,6,7,r0
+ SPILL r0, 1,2,3,5,7
+ ret
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] )
+; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_field_mmx
- mov edx, [esp+8]
- mov ecx, [esp+4]
- punpcklwd mm0, [edx]
- punpckhwd mm1, [edx]
- punpcklwd mm2, [edx+8]
- punpckhwd mm3, [edx+8]
- punpcklwd mm4, [edx+16]
- punpckhwd mm5, [edx+16]
- punpcklwd mm6, [edx+24]
- punpckhwd mm7, [edx+24]
- psrad mm0, 16
- psrad mm1, 16
- psrad mm2, 16
- psrad mm3, 16
- psrad mm4, 16
- psrad mm5, 16
- psrad mm6, 16
- psrad mm7, 16
- movq [ecx ], mm0
- movq [ecx+16], mm2
- movq [ecx+24], mm3
- movq [ecx+32], mm4
- movq [ecx+40], mm5
- movq [ecx+48], mm6
- movq [ecx+56], mm7
- movq [ecx+12], mm1
- movd [ecx+ 8], mm2
+cglobal x264_add8x8_idct8_sse2, 2,2
+global x264_add8x8_idct8_sse2 %+ .skip_prologue
+.skip_prologue:
+ UNSPILL r1, 1,2,3,5,6,7
+ IDCT8_1D 0,1,2,3,4,5,6,7,r1
+ SPILL r1, 6
+ TRANSPOSE8x8W 0,1,2,3,4,5,6,7,r1
+ picgetgot edx
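+ ; +32 on the first row propagates through the second 1d pass and provides the rounding for the >>6 in STORE_DIFF_8P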
+ paddw m0, [pw_32 GLOBAL]
+ SPILL r1, 0
+ IDCT8_1D 0,1,2,3,4,5,6,7,r1
+ SPILL r1, 6,7
+ pxor m7, m7
+ STORE_DIFF_8P m0, [r0+FDEC_STRIDE*0], m6, m7
+ STORE_DIFF_8P m1, [r0+FDEC_STRIDE*1], m6, m7
+ STORE_DIFF_8P m2, [r0+FDEC_STRIDE*2], m6, m7
+ STORE_DIFF_8P m3, [r0+FDEC_STRIDE*3], m6, m7
+ STORE_DIFF_8P m4, [r0+FDEC_STRIDE*4], m6, m7
+ STORE_DIFF_8P m5, [r0+FDEC_STRIDE*5], m6, m7
+ UNSPILL_SHUFFLE r1, 0,1, 6,7
+ STORE_DIFF_8P m0, [r0+FDEC_STRIDE*6], m6, m7
+ STORE_DIFF_8P m1, [r0+FDEC_STRIDE*7], m6, m7
ret
+