;*****************************************************************************
-;* dct-a.asm: h264 encoder library
+;* dct-a.asm: x86 transform and zigzag
;*****************************************************************************
-;* Copyright (C) 2003-2008 x264 project
+;* Copyright (C) 2003-2011 x264 project
;*
-;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
-;* Min Chen <chenm001.163.com>
+;* Authors: Holger Lubitz <holger@lubitz.org>
;* Loren Merritt <lorenm@u.washington.edu>
+;* Laurent Aimar <fenrir@via.ecp.fr>
+;* Min Chen <chenm001.163.com>
+;* Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
+%include "x86util.asm"
-SECTION_RODATA
-pw_1: times 8 dw 1
-pw_32: times 8 dw 32
-pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
-
-SECTION .text
-
-%macro LOAD_DIFF_4P 5
- movh %1, %4
- punpcklbw %1, %3
- movh %2, %5
- punpcklbw %2, %3
- psubw %1, %2
+%macro SHUFFLE_16BIT 8
+ %rep 8
+ db %1*2
+ db %1*2+1
+ %rotate 1
+ %endrep
%endmacro
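+
+; SHUFFLE_16BIT builds a 16-byte pshufb mask that permutes eight 16-bit
+; words. For example, "SHUFFLE_16BIT 6,3,7,0,4,1,2,5" emits the bytes
+; 12,13, 6,7, 14,15, 0,1, 8,9, 2,3, 4,5, 10,11 (each word index doubled
+; into a little-endian byte pair).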
-%macro SUMSUB_BA 2
- paddw %1, %2
- paddw %2, %2
- psubw %2, %1
-%endmacro
-
-%macro SUMSUB_BADC 4
- paddw %1, %2
- paddw %3, %4
- paddw %2, %2
- paddw %4, %4
- psubw %2, %1
- psubw %4, %3
-%endmacro
-
-%macro SUMSUB2_AB 3
- mova %3, %1
- paddw %1, %1
- paddw %1, %2
- psubw %3, %2
- psubw %3, %2
-%endmacro
-
-%macro SUMSUBD2_AB 4
- mova %4, %1
- mova %3, %2
- psraw %2, 1
- psraw %4, 1
- paddw %1, %2
- psubw %4, %3
-%endmacro
+SECTION_RODATA
+pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
+pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
+pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
+pb_scan4framea: SHUFFLE_16BIT 6,3,7,0,4,1,2,5
+pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
+pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
+pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
-%macro SBUTTERFLY 4
- mova m%4, m%2
- punpckl%1 m%2, m%3
- punpckh%1 m%4, m%3
- SWAP %3, %4
-%endmacro
+SECTION .text
-%macro TRANSPOSE4x4W 5
- SBUTTERFLY wd, %1, %2, %5
- SBUTTERFLY wd, %3, %4, %5
- SBUTTERFLY dq, %1, %3, %5
- SBUTTERFLY dq, %2, %4, %5
- SWAP %2, %3
-%endmacro
+cextern pw_32_0
+cextern pw_32
+cextern pw_8000
+cextern pw_pixel_max
+cextern hsub_mul
+cextern pb_1
+cextern pw_1
+cextern pd_1
+cextern pd_32
-%macro TRANSPOSE2x4x4W 5
- SBUTTERFLY wd, %1, %2, %5
- SBUTTERFLY wd, %3, %4, %5
- SBUTTERFLY dq, %1, %3, %5
- SBUTTERFLY dq, %2, %4, %5
- SBUTTERFLY qdq, %1, %2, %5
- SBUTTERFLY qdq, %3, %4, %5
+%macro WALSH4_1D 6
+ SUMSUB_BADC %1, %5, %4, %3, %2, %6
+ SUMSUB_BADC %1, %5, %3, %4, %2, %6
+ SWAP %2, %5, %4
%endmacro
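+
+; WALSH4_1D: a 1-D 4-point Walsh-Hadamard transform built from two
+; SUMSUB_BADC passes. For inputs a,b,c,d (regs %2..%5) it leaves, after
+; the final SWAP:
+;   a+b+c+d, a+b-c-d, a-b-c+d, a-b+c-d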
-%macro STORE_DIFF_4P 4
- psraw %1, 6
- movh %2, %4
- punpcklbw %2, %3
- paddsw %1, %2
- packuswb %1, %1
- movh %4, %1
+%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
+ movq m%3, m%4
+ pxor m%1, m%4
+ psubw m%3, m%2
+ pxor m%2, m%4
+ pavgw m%3, m%1
+ pavgw m%2, m%1
+ pxor m%3, m%4
+ pxor m%2, m%4
+ SWAP %1, %2, %3
%endmacro
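+
+; SUMSUB_17BIT computes (a+b+1)>>1 and (a-b+1)>>1 without overflowing
+; 16-bit lanes: the inputs are rebiased to unsigned by xoring with 0x8000
+; so that pavgw (an unsigned average with a 17-bit internal sum) can form
+; the halved sum/difference, then the bias is xored back out.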
-%macro HADAMARD4_1D 4
- SUMSUB_BADC m%2, m%1, m%4, m%3
- SUMSUB_BADC m%4, m%2, m%3, m%1
- SWAP %1, %4, %3
+%macro DCT_UNPACK 3
+ punpcklwd %3, %1
+ punpckhwd %2, %1
+ psrad %3, 16
+ psrad %2, 16
+ SWAP %1, %3
%endmacro
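+
+; DCT_UNPACK: sign-extend one register of eight words into two registers
+; of four dwords (punpck*wd places each word in a dword's high half,
+; psrad 16 shifts it back down with sign extension).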
+%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void x264_dct4x4dc_mmx( int16_t d[4][4] )
+; void dct4x4dc( dctcoef d[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_dct4x4dc_mmx, 1,1,1
- movq m0, [r0+ 0]
- movq m1, [r0+ 8]
- movq m2, [r0+16]
+%macro DCT4x4_DC 1
+cglobal dct4x4dc_%1, 1,1,5
+ mova m0, [r0+ 0]
+ mova m1, [r0+16]
+ mova m2, [r0+32]
+ mova m3, [r0+48]
+ WALSH4_1D d, 0,1,2,3,4
+ TRANSPOSE4x4D 0,1,2,3,4
+ paddd m0, [pd_1]
+ WALSH4_1D d, 0,1,2,3,4
+ psrad m0, 1
+ psrad m1, 1
+ psrad m2, 1
+ psrad m3, 1
+ mova [r0+ 0], m0
+ mova [r0+16], m1
+ mova [r0+32], m2
+ mova [r0+48], m3
+ RET
+%endmacro ; DCT4x4_DC
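+
+; A rough, unoptimized C model of the routine above (dctcoef as elsewhere
+; in x264; the intermediate row order may differ from the exact register
+; shuffles, but the 2-D result is the same). Since the transform is
+; linear, the single pd_1 added to row 0 before the second pass becomes a
+; +1 rounding term on every output:
+;
+;   static void dct4x4dc_c( dctcoef d[4][4] )
+;   {
+;       dctcoef tmp[4][4];
+;       for( int i = 0; i < 4; i++ )
+;       {
+;           int s01 = d[i][0] + d[i][1], d01 = d[i][0] - d[i][1];
+;           int s23 = d[i][2] + d[i][3], d23 = d[i][2] - d[i][3];
+;           tmp[0][i] = s01 + s23; tmp[1][i] = s01 - s23;
+;           tmp[2][i] = d01 - d23; tmp[3][i] = d01 + d23;
+;       }
+;       for( int i = 0; i < 4; i++ )
+;       {
+;           int s01 = tmp[i][0] + tmp[i][1], d01 = tmp[i][0] - tmp[i][1];
+;           int s23 = tmp[i][2] + tmp[i][3], d23 = tmp[i][2] - tmp[i][3];
+;           d[i][0] = ( s01 + s23 + 1 ) >> 1; d[i][1] = ( s01 - s23 + 1 ) >> 1;
+;           d[i][2] = ( d01 - d23 + 1 ) >> 1; d[i][3] = ( d01 + d23 + 1 ) >> 1;
+;       }
+;   }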
+
+INIT_XMM
+DCT4x4_DC sse2
+INIT_AVX
+DCT4x4_DC avx
+%else
+
+INIT_MMX
+cglobal dct4x4dc_mmx, 1,1
movq m3, [r0+24]
- HADAMARD4_1D 0,1,2,3
+ movq m2, [r0+16]
+ movq m1, [r0+ 8]
+ movq m0, [r0+ 0]
+ movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
+ WALSH4_1D w, 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
- HADAMARD4_1D 0,1,2,3
- movq m6, [pw_1 GLOBAL]
- paddw m0, m6
- paddw m1, m6
- paddw m2, m6
- paddw m3, m6
- psraw m0, 1
- psraw m1, 1
- psraw m2, 1
- psraw m3, 1
+ SUMSUB_BADC w, 1, 0, 3, 2, 4
+ SWAP 0, 1
+ SWAP 2, 3
+ SUMSUB_17BIT 0,2,4,7
+ SUMSUB_17BIT 1,3,5,7
movq [r0+0], m0
- movq [r0+8], m1
- movq [r0+16], m2
- movq [r0+24], m3
+ movq [r0+8], m2
+ movq [r0+16], m3
+ movq [r0+24], m1
RET
+%endif ; HIGH_BIT_DEPTH
+%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void x264_idct4x4dc_mmx( int16_t d[4][4] )
+; void idct4x4dc( int32_t d[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_idct4x4dc_mmx, 1,1
- movq m0, [r0+ 0]
- movq m1, [r0+ 8]
- movq m2, [r0+16]
- movq m3, [r0+24]
- HADAMARD4_1D 0,1,2,3
+%macro IDCT4x4DC 1
+cglobal idct4x4dc_%1, 1,1
+ mova m3, [r0+48]
+ mova m2, [r0+32]
+ mova m1, [r0+16]
+ mova m0, [r0+ 0]
+ WALSH4_1D d,0,1,2,3,4
+ TRANSPOSE4x4D 0,1,2,3,4
+ WALSH4_1D d,0,1,2,3,4
+ mova [r0+ 0], m0
+ mova [r0+16], m1
+ mova [r0+32], m2
+ mova [r0+48], m3
+ RET
+%endmacro ; IDCT4x4DC
+
+INIT_XMM
+IDCT4x4DC sse2
+INIT_AVX
+IDCT4x4DC avx
+%else
+
+INIT_MMX
+;-----------------------------------------------------------------------------
+; void idct4x4dc( int16_t d[4][4] )
+;-----------------------------------------------------------------------------
+cglobal idct4x4dc_mmx, 1,1
+ movq m3, [r0+24]
+ movq m2, [r0+16]
+ movq m1, [r0+ 8]
+ movq m0, [r0+ 0]
+ WALSH4_1D w,0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
- HADAMARD4_1D 0,1,2,3
+ WALSH4_1D w,0,1,2,3,4
movq [r0+ 0], m0
movq [r0+ 8], m1
movq [r0+16], m2
movq [r0+24], m3
RET
+%endif ; HIGH_BIT_DEPTH
-%macro DCT4_1D 5
- SUMSUB_BADC m%4, m%1, m%3, m%2
- SUMSUB_BA m%3, m%4
- SUMSUB2_AB m%1, m%2, m%5
- SWAP %1, %3, %4, %5, %2
-%endmacro
-
-%macro IDCT4_1D 6
- SUMSUB_BA m%3, m%1
- SUMSUBD2_AB m%2, m%4, m%6, m%5
- SUMSUB_BADC m%2, m%3, m%5, m%1
- SWAP %1, %2, %5, %4, %3
-%endmacro
-
+INIT_MMX
+%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
+; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub4x4_dct_mmx, 3,3
+cglobal sub4x4_dct_mmx, 3,3
.skip_prologue:
+ LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+ LOAD_DIFF m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+ LOAD_DIFF m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+ DCT4_1D 0,1,2,3,4
+ TRANSPOSE4x4W 0,1,2,3,4
+
+ SUMSUB_BADC w, 3, 0, 2, 1
+ SUMSUB_BA w, 2, 3, 4
+ DCT_UNPACK m2, m4, m5
+ DCT_UNPACK m3, m6, m7
+ mova [r0+ 0], m2 ; s03 + s12
+ mova [r0+ 8], m4
+ mova [r0+32], m3 ; s03 - s12
+ mova [r0+40], m6
+
+ DCT_UNPACK m0, m2, m4
+ DCT_UNPACK m1, m3, m5
+ SUMSUB2_AB d, 0, 1, 4
+ SUMSUB2_AB d, 2, 3, 5
+ mova [r0+16], m0 ; d03*2 + d12
+ mova [r0+24], m2
+ mova [r0+48], m4 ; d03 - 2*d12
+ mova [r0+56], m5
+ RET
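+
+; For reference, the 1-D butterfly implemented by DCT4_1D and by the
+; unpacked second pass above (d[] is the residual pix1 - pix2):
+;
+;   int s03 = d[0] + d[3], s12 = d[1] + d[2];
+;   int d03 = d[0] - d[3], d12 = d[1] - d[2];
+;   out[0] =   s03 +   s12;
+;   out[1] = 2*d03 +   d12;
+;   out[2] =   s03 -   s12;
+;   out[3] =   d03 - 2*d12;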
+%else
+
%macro SUB_DCT4 1
- LOAD_DIFF_4P m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
- LOAD_DIFF_4P m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
- LOAD_DIFF_4P m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
- LOAD_DIFF_4P m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+cglobal sub4x4_dct_%1, 3,3
+%ifidn %1, mmx
+.skip_prologue:
+ LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF m3, m4, m5, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
+ LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
+ LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+%else
+ mova m5, [hsub_mul]
+ LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
+%endif
DCT4_1D 0,1,2,3,4
- TRANSPOSE%1 0,1,2,3,4
+ TRANSPOSE4x4W 0,1,2,3,4
DCT4_1D 0,1,2,3,4
movq [r0+ 0], m0
movq [r0+ 8], m1
movq [r0+16], m2
movq [r0+24], m3
-%endmacro
- SUB_DCT4 4x4W
RET
+%endmacro
+
+SUB_DCT4 mmx
+SUB_DCT4 ssse3
+%endif ; HIGH_BIT_DEPTH
+%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
+; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_add4x4_idct_mmx, 2,2,1
-.skip_prologue:
- movq m0, [r1+ 0]
- movq m1, [r1+ 8]
- movq m2, [r1+16]
- movq m3, [r1+24]
-%macro ADD_IDCT4 1
- IDCT4_1D 0,1,2,3,4,5
- TRANSPOSE%1 0,1,2,3,4
- paddw m0, [pw_32 GLOBAL]
- IDCT4_1D 0,1,2,3,4,5
- pxor m7, m7
- STORE_DIFF_4P m0, m4, m7, [r0+0*FDEC_STRIDE]
- STORE_DIFF_4P m1, m4, m7, [r0+1*FDEC_STRIDE]
- STORE_DIFF_4P m2, m4, m7, [r0+2*FDEC_STRIDE]
- STORE_DIFF_4P m3, m4, m7, [r0+3*FDEC_STRIDE]
+%macro STORE_DIFFx2 6
+ psrad %1, 6
+ psrad %2, 6
+ packssdw %1, %2
+ movq %3, %5
+ movhps %3, %6
+ paddsw %1, %3
+ CLIPW %1, %4, [pw_pixel_max]
+ movq %5, %1
+ movhps %6, %1
%endmacro
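+
+; STORE_DIFFx2: pack two rows of 32-bit residuals to words, add them to
+; two reconstructed rows, and clip the result to [0,pw_pixel_max].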
- ADD_IDCT4 4x4W
+
+%macro ADD4x4_IDCT 1
+cglobal add4x4_idct_%1, 2,2,6
+ add r0, 4*FDEC_STRIDE
+.skip_prologue:
+ mova m1, [r1+16]
+ mova m3, [r1+48]
+ mova m2, [r1+32]
+ mova m0, [r1+ 0]
+ IDCT4_1D d,0,1,2,3,4,5
+ TRANSPOSE4x4D 0,1,2,3,4
+ paddd m0, [pd_32]
+ IDCT4_1D d,0,1,2,3,4,5
+ pxor m5, m5
+ STORE_DIFFx2 m0, m1, m4, m5, [r0-4*FDEC_STRIDE], [r0-2*FDEC_STRIDE]
+ STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDE], [r0+2*FDEC_STRIDE]
RET
+%endmacro
INIT_XMM
+ADD4x4_IDCT sse2
+INIT_AVX
+ADD4x4_IDCT avx
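+
+; The matching 1-D inverse step, applied twice by IDCT4_1D; +32 is folded
+; into row 0 between the passes (again a +32 on every output, by
+; linearity), then each result is >>6, added to the prediction and
+; clipped:
+;
+;   int s02 =  t[0]     +  t[2];
+;   int d02 =  t[0]     -  t[2];
+;   int s13 =  t[1]     + (t[3]>>1);
+;   int d13 = (t[1]>>1) -  t[3];
+;   out[0] = s02 + s13;
+;   out[1] = d02 + d13;
+;   out[2] = d02 - d13;
+;   out[3] = s02 - s13;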
-cglobal x264_sub8x8_dct_sse2, 3,3
-.skip_prologue:
- call .8x4
- add r0, 64
- add r1, 4*FENC_STRIDE
- add r2, 4*FDEC_STRIDE
-.8x4:
- SUB_DCT4 2x4x4W
- movhps [r0+32], m0
- movhps [r0+40], m1
- movhps [r0+48], m2
- movhps [r0+56], m3
- ret
+%else ; !HIGH_BIT_DEPTH
-cglobal x264_add8x8_idct_sse2, 2,2,1
+cglobal add4x4_idct_mmx, 2,2
+ pxor m7, m7
.skip_prologue:
- call .8x4
- add r1, 64
- add r0, 4*FDEC_STRIDE
-.8x4:
- movq m0, [r1+ 0]
- movq m1, [r1+ 8]
- movq m2, [r1+16]
- movq m3, [r1+24]
- movhps m0, [r1+32]
- movhps m1, [r1+40]
- movhps m2, [r1+48]
- movhps m3, [r1+56]
- ADD_IDCT4 2x4x4W
- ret
+ movq m1, [r1+ 8]
+ movq m3, [r1+24]
+ movq m2, [r1+16]
+ movq m0, [r1+ 0]
+ IDCT4_1D w,0,1,2,3,4,5
+ TRANSPOSE4x4W 0,1,2,3,4
+ paddw m0, [pw_32]
+ IDCT4_1D w,0,1,2,3,4,5
+ STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
+ STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
+ STORE_DIFF m2, m4, m7, [r0+2*FDEC_STRIDE]
+ STORE_DIFF m3, m4, m7, [r0+3*FDEC_STRIDE]
+ RET
+
+%macro ADD4x4 1
+cglobal add4x4_idct_%1, 2,2,6
+ mova m1, [r1+0x00] ; row1/row0
+ mova m3, [r1+0x10] ; row3/row2
+ psraw m0, m1, 1 ; row1>>1/...
+ psraw m2, m3, 1 ; row3>>1/...
+ movsd m0, m1 ; row1>>1/row0
+ movsd m2, m3 ; row3>>1/row2
+ psubw m0, m3 ; row1>>1-row3/row0-2
+ paddw m2, m1 ; row3>>1+row1/row0+2
+ SBUTTERFLY2 wd, 0, 2, 1
+ SUMSUB_BA w, 2, 0, 1
+ pshuflw m1, m2, 10110001b
+ pshufhw m2, m2, 10110001b
+ punpckldq m1, m0
+ punpckhdq m2, m0
+ SWAP 0, 1
+
+ mova m1, [pw_32_0]
+ paddw m1, m0 ; row1/row0 corrected
+ psraw m0, 1 ; row1>>1/...
+ psraw m3, m2, 1 ; row3>>1/...
+ movsd m0, m1 ; row1>>1/row0
+ movsd m3, m2 ; row3>>1/row2
+ psubw m0, m2 ; row1>>1-row3/row0-2
+ paddw m3, m1 ; row3>>1+row1/row0+2
+ SBUTTERFLY2 qdq, 0, 3, 1
+ SUMSUB_BA w, 3, 0, 1
+
+ movd m4, [r0+FDEC_STRIDE*0]
+ movd m1, [r0+FDEC_STRIDE*1]
+ movd m2, [r0+FDEC_STRIDE*2]
+ movd m5, [r0+FDEC_STRIDE*3]
+ punpckldq m1, m4 ; row0/row1
+ pxor m4, m4
+ punpckldq m2, m5 ; row3/row2
+ punpcklbw m1, m4
+ psraw m3, 6
+ punpcklbw m2, m4
+ psraw m0, 6
+ paddsw m3, m1
+ paddsw m0, m2
+ packuswb m0, m3 ; row0/row1/row3/row2
+ pextrd [r0+FDEC_STRIDE*0], m0, 3
+ pextrd [r0+FDEC_STRIDE*1], m0, 2
+ movd [r0+FDEC_STRIDE*2], m0
+ pextrd [r0+FDEC_STRIDE*3], m0, 1
+ RET
+%endmacro ; ADD4x4
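+
+; The sse4/avx path above avoids a full transpose: each xmm register
+; keeps two rows packed, psraw/movsd build the (row>>1) terms only where
+; the butterfly needs them, and the pshuflw/punpck shuffles double as the
+; reordering between the two IDCT passes.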
+INIT_XMM
+ADD4x4 sse4
+INIT_AVX
+ADD4x4 avx
+%endif ; HIGH_BIT_DEPTH
+
+INIT_MMX
;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
-cglobal %1, 3,3
+cglobal %1, 3,3,11*(mmsize/16)
+%ifndef HIGH_BIT_DEPTH
+%if mmsize == 8
+ pxor m7, m7
+%else
+ add r2, 4*FDEC_STRIDE
+ mova m7, [hsub_mul]
+%endif
+%endif ; !HIGH_BIT_DEPTH
.skip_prologue:
+%ifdef WIN64
+ sub rsp, 8
+%endif
    call %2
    add  r0, %3
    add  r1, %4-%5-%6*FENC_STRIDE
    add  r2, %4-%5-%6*FDEC_STRIDE
    call %2
    add  r0, %3
    add  r1, (%4-%6)*FENC_STRIDE-%5-%4
    add  r2, (%4-%6)*FDEC_STRIDE-%5-%4
    call %2
    add  r0, %3
    add  r1, %4-%5-%6*FENC_STRIDE
    add  r2, %4-%5-%6*FDEC_STRIDE
+%ifdef WIN64
+ call %2
+ add rsp, 8
+ RET
+%else
jmp %2
+%endif
%endmacro
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
+; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
-%macro ADD_NxN_IDCT 6
-cglobal %1, 2,2,1
+%macro ADD_NxN_IDCT 6-7
+%ifdef HIGH_BIT_DEPTH
+cglobal %1, 2,2,6*(mmsize/16)
+%else
+cglobal %1, 2,2,11*(mmsize/16)
+ pxor m7, m7
+%endif
+%if mmsize==16
+ add r0, 4*FDEC_STRIDE
+%endif
.skip_prologue:
+%ifdef WIN64
+ sub rsp, 8
+%endif
    call %2
    add  r0, %4-%5-%6*FDEC_STRIDE
    add  r1, %3
    call %2
    add  r0, (%4-%6)*FDEC_STRIDE-%5-%4
    add  r1, %3
    call %2
    add  r0, %4-%5-%6*FDEC_STRIDE
    add  r1, %3
+%ifdef WIN64
+ call %2
+ add rsp, 8
+ RET
+%else
jmp %2
+%endif
%endmacro
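+
+; In SUB_NxN_DCT/ADD_NxN_IDCT: %2 is the quarter-size worker entered at
+; .skip_prologue, %3 the byte distance between its coefficient blocks, %4
+; the sub-block step in bytes, and %5/%6 (roughly) the horizontal and
+; vertical pointer advance the worker itself leaves behind, which the add
+; formulas undo when stepping to the next sub-block.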
-SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0
-ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
+%ifdef HIGH_BIT_DEPTH
+INIT_MMX
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 64, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 64, 16, 8, 8
+INIT_XMM
+ADD_NxN_IDCT add8x8_idct_sse2,   add4x4_idct_sse2.skip_prologue, 64,  8, 0, 0
+ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 64, 16, 8, 8
+INIT_AVX
+ADD_NxN_IDCT add8x8_idct_avx,    add4x4_idct_avx.skip_prologue,  64,  8, 0, 0
+ADD_NxN_IDCT add16x16_idct_avx,  add8x8_idct_avx.skip_prologue,  64, 16, 8, 8
+%else ; !HIGH_BIT_DEPTH
+%ifndef ARCH_X86_64
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
+ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
+ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
-SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4
-ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
+cextern sub8x8_dct8_mmx.skip_prologue
+cextern add8x8_idct8_mmx.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
+%endif
-SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
-ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
+INIT_XMM
+cextern sub8x8_dct_sse2.skip_prologue
+cextern sub8x8_dct_ssse3.skip_prologue
+cextern sub8x8_dct_avx.skip_prologue
+SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_avx, sub8x8_dct_avx.skip_prologue, 128, 8, 0, 0
-%ifndef ARCH_X86_64
-cextern x264_sub8x8_dct8_mmx.skip_prologue
-cextern x264_add8x8_idct8_mmx.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
-%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
-%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
+cextern add8x8_idct_sse2.skip_prologue
+cextern add8x8_idct_avx.skip_prologue
+ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct_avx, add8x8_idct_avx.skip_prologue, 128, 8, 0, 0
+
+cextern add8x8_idct8_sse2.skip_prologue
+cextern add8x8_idct8_avx.skip_prologue
+ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_avx, add8x8_idct8_avx.skip_prologue, 128, 8, 0, 0
+
+cextern sub8x8_dct8_sse2.skip_prologue
+cextern sub8x8_dct8_ssse3.skip_prologue
+cextern sub8x8_dct8_avx.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx.skip_prologue, 128, 8, 0, 0
+%endif ; HIGH_BIT_DEPTH
+
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+;-----------------------------------------------------------------------------
+; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
+;-----------------------------------------------------------------------------
+%macro ADD_DC 2
+ mova m0, [%1+FDEC_STRIDEB*0] ; 8pixels
+ mova m1, [%1+FDEC_STRIDEB*1]
+ mova m2, [%1+FDEC_STRIDEB*2]
+ paddsw m0, %2
+ paddsw m1, %2
+ paddsw m2, %2
+ paddsw %2, [%1+FDEC_STRIDEB*3]
+ CLIPW m0, m5, m6
+ CLIPW m1, m5, m6
+ CLIPW m2, m5, m6
+ CLIPW %2, m5, m6
+ mova [%1+FDEC_STRIDEB*0], m0
+ mova [%1+FDEC_STRIDEB*1], m1
+ mova [%1+FDEC_STRIDEB*2], m2
+ mova [%1+FDEC_STRIDEB*3], %2
+%endmacro
+
+%macro ADD_IDCT_DC 1
+cglobal add8x8_idct_dc_%1, 2,2,7
+ mova m6, [pw_pixel_max]
+ pxor m5, m5
+ mova m3, [r1]
+ paddd m3, [pd_32]
+ psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
+ pshuflw m4, m3, 10100000b ; dc0 dc0 dc1 dc1 _ _ _ _
+ pshufhw m3, m3, 10100000b ; _ _ _ _ dc2 dc2 dc3 dc3
+ pshufd m4, m4, 01010000b ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+ pshufd m3, m3, 11111010b ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+ ADD_DC r0+FDEC_STRIDEB*0, m4
+ ADD_DC r0+FDEC_STRIDEB*4, m3
+ RET
+
+cglobal add16x16_idct_dc_%1, 2,3,8
+ mov r2, 4
+ mova m6, [pw_pixel_max]
+ mova m7, [pd_32]
+ pxor m5, m5
+.loop:
+ mova m3, [r1]
+ paddd m3, m7
+ psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0
+ pshuflw m4, m3, 10100000b ; dc0 dc0 dc1 dc1 _ _ _ _
+ pshufhw m3, m3, 10100000b ; _ _ _ _ dc2 dc2 dc3 dc3
+ pshufd m4, m4, 01010000b ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+ pshufd m3, m3, 11111010b ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+ ADD_DC r0+FDEC_STRIDEB*0, m4
+ ADD_DC r0+SIZEOF_PIXEL*8, m3
+ add r1, 16
+ add r0, 4*FDEC_STRIDEB
+ dec r2
+ jg .loop
+ REP_RET
+%endmacro ; ADD_IDCT_DC
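+
+; Per 4x4 quadrant this amounts to (sketch; x264_clip3 and PIXEL_MAX as
+; in x264's C code):
+;
+;   int dc = ( dct2x2[i] + 32 ) >> 6;
+;   for( int y = 0; y < 4; y++ )
+;       for( int x = 0; x < 4; x++ )
+;           dst[y*FDEC_STRIDE+x] =
+;               x264_clip3( dst[y*FDEC_STRIDE+x] + dc, 0, PIXEL_MAX );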
+
+INIT_XMM
+ADD_IDCT_DC sse2
+INIT_AVX
+ADD_IDCT_DC avx
+
+%else ;!HIGH_BIT_DEPTH
+%macro ADD_DC 3
+ movq mm4, [%3+FDEC_STRIDE*0]
+ movq mm5, [%3+FDEC_STRIDE*1]
+ movq mm6, [%3+FDEC_STRIDE*2]
+ paddusb mm4, %1
+ paddusb mm5, %1
+ paddusb mm6, %1
+ paddusb %1, [%3+FDEC_STRIDE*3]
+ psubusb mm4, %2
+ psubusb mm5, %2
+ psubusb mm6, %2
+ psubusb %1, %2
+ movq [%3+FDEC_STRIDE*0], mm4
+ movq [%3+FDEC_STRIDE*1], mm5
+ movq [%3+FDEC_STRIDE*2], mm6
+ movq [%3+FDEC_STRIDE*3], %1
+%endmacro
+
+cglobal add8x8_idct_dc_mmx, 2,2
+ movq mm0, [r1]
+ pxor mm1, mm1
+ add r0, FDEC_STRIDE*4
+ paddw mm0, [pw_32]
+ psraw mm0, 6
+ psubw mm1, mm0
+ packuswb mm0, mm0
+ packuswb mm1, mm1
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ pshufw mm2, mm0, 0xFA
+ pshufw mm3, mm1, 0xFA
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ ADD_DC mm0, mm1, r0-FDEC_STRIDE*4
+ ADD_DC mm2, mm3, r0
+ RET
+
+cglobal add8x8_idct_dc_ssse3, 2,2
+ movq xmm0, [r1]
+ pxor xmm1, xmm1
+ add r0, FDEC_STRIDE*4
+ paddw xmm0, [pw_32]
+ psraw xmm0, 6
+ psubw xmm1, xmm0
+ movdqa xmm5, [pb_idctdc_unpack]
+ packuswb xmm0, xmm0
+ packuswb xmm1, xmm1
+ pshufb xmm0, xmm5
+ pshufb xmm1, xmm5
+ movq xmm2, [r0+FDEC_STRIDE*-4]
+ movq xmm3, [r0+FDEC_STRIDE*-3]
+ movq xmm4, [r0+FDEC_STRIDE*-2]
+ movq xmm5, [r0+FDEC_STRIDE*-1]
+ movhps xmm2, [r0+FDEC_STRIDE* 0]
+ movhps xmm3, [r0+FDEC_STRIDE* 1]
+ movhps xmm4, [r0+FDEC_STRIDE* 2]
+ movhps xmm5, [r0+FDEC_STRIDE* 3]
+ paddusb xmm2, xmm0
+ paddusb xmm3, xmm0
+ paddusb xmm4, xmm0
+ paddusb xmm5, xmm0
+ psubusb xmm2, xmm1
+ psubusb xmm3, xmm1
+ psubusb xmm4, xmm1
+ psubusb xmm5, xmm1
+ movq [r0+FDEC_STRIDE*-4], xmm2
+ movq [r0+FDEC_STRIDE*-3], xmm3
+ movq [r0+FDEC_STRIDE*-2], xmm4
+ movq [r0+FDEC_STRIDE*-1], xmm5
+ movhps [r0+FDEC_STRIDE* 0], xmm2
+ movhps [r0+FDEC_STRIDE* 1], xmm3
+ movhps [r0+FDEC_STRIDE* 2], xmm4
+ movhps [r0+FDEC_STRIDE* 3], xmm5
+ RET
+
+cglobal add16x16_idct_dc_mmx, 2,3
+ mov r2, 4
+.loop:
+ movq mm0, [r1]
+ pxor mm1, mm1
+ paddw mm0, [pw_32]
+ psraw mm0, 6
+ psubw mm1, mm0
+ packuswb mm0, mm0
+ packuswb mm1, mm1
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ pshufw mm2, mm0, 0xFA
+ pshufw mm3, mm1, 0xFA
+ punpcklbw mm0, mm0
+ punpcklbw mm1, mm1
+ ADD_DC mm0, mm1, r0
+ ADD_DC mm2, mm3, r0+8
+ add r1, 8
+ add r0, FDEC_STRIDE*4
+ dec r2
+ jg .loop
+ REP_RET
+
+%macro IDCT_DC_STORE 3
+ movdqa xmm4, [r0+%1+FDEC_STRIDE*0]
+ movdqa xmm5, [r0+%1+FDEC_STRIDE*1]
+ movdqa xmm6, [r0+%1+FDEC_STRIDE*2]
+ movdqa xmm7, [r0+%1+FDEC_STRIDE*3]
+ paddusb xmm4, %2
+ paddusb xmm5, %2
+ paddusb xmm6, %2
+ paddusb xmm7, %2
+ psubusb xmm4, %3
+ psubusb xmm5, %3
+ psubusb xmm6, %3
+ psubusb xmm7, %3
+ movdqa [r0+%1+FDEC_STRIDE*0], xmm4
+ movdqa [r0+%1+FDEC_STRIDE*1], xmm5
+ movdqa [r0+%1+FDEC_STRIDE*2], xmm6
+ movdqa [r0+%1+FDEC_STRIDE*3], xmm7
+%endmacro
+
+cglobal add16x16_idct_dc_sse2, 2,2,8
+ call .loop
+ add r0, FDEC_STRIDE*4
+%ifdef WIN64
+ call .loop
+ RET
%endif
+.loop:
+ add r0, FDEC_STRIDE*4
+ movq xmm0, [r1+0]
+ movq xmm2, [r1+8]
+ add r1, 16
+ punpcklwd xmm0, xmm0
+ punpcklwd xmm2, xmm2
+ pxor xmm3, xmm3
+ paddw xmm0, [pw_32]
+ paddw xmm2, [pw_32]
+ psraw xmm0, 6
+ psraw xmm2, 6
+ psubw xmm1, xmm3, xmm0
+ packuswb xmm0, xmm1
+ psubw xmm3, xmm2
+ punpckhbw xmm1, xmm0, xmm0
+ packuswb xmm2, xmm3
+ punpckhbw xmm3, xmm2, xmm2
+ punpcklbw xmm0, xmm0
+ punpcklbw xmm2, xmm2
+ IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
+ IDCT_DC_STORE 0, xmm2, xmm3
+ ret
+
+%macro ADD16x16 1
+cglobal add16x16_idct_dc_%1, 2,2,8
+ call .loop
+ add r0, FDEC_STRIDE*4
+%ifdef WIN64
+ call .loop
+ RET
+%endif
+.loop:
+ add r0, FDEC_STRIDE*4
+ movdqa xmm0, [r1]
+ add r1, 16
+ pxor xmm1, xmm1
+ paddw xmm0, [pw_32]
+ psraw xmm0, 6
+ psubw xmm1, xmm0
+ movdqa xmm5, [ pb_idctdc_unpack]
+ movdqa xmm6, [pb_idctdc_unpack2]
+ packuswb xmm0, xmm0
+ packuswb xmm1, xmm1
+ pshufb xmm2, xmm0, xmm6
+ pshufb xmm0, xmm5
+ pshufb xmm3, xmm1, xmm6
+ pshufb xmm1, xmm5
+ IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
+ IDCT_DC_STORE 0, xmm2, xmm3
+ ret
+%endmacro ; ADD16x16
+
+INIT_XMM
+ADD16x16 ssse3
+INIT_AVX
+ADD16x16 avx
+
+%endif ; HIGH_BIT_DEPTH
+
+;-----------------------------------------------------------------------------
+; void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
+;-----------------------------------------------------------------------------
+
+%macro DCTDC_2ROW_MMX 3
+ movq %1, [r1+FENC_STRIDE*(0+%3)]
+ movq m1, [r1+FENC_STRIDE*(1+%3)]
+ movq m2, [r2+FDEC_STRIDE*(0+%3)]
+ movq m3, [r2+FDEC_STRIDE*(1+%3)]
+ movq %2, %1
+ punpckldq %1, m1
+ punpckhdq %2, m1
+ movq m1, m2
+ punpckldq m2, m3
+ punpckhdq m1, m3
+ pxor m3, m3
+ psadbw %1, m3
+ psadbw %2, m3
+ psadbw m2, m3
+ psadbw m1, m3
+ psubw %1, m2
+ psubw %2, m1
+%endmacro
-cextern x264_sub8x8_dct8_sse2
-cextern x264_add8x8_idct8_sse2
-SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
+%macro DCT2x2 2 ; reg s1/s0 (!=m1), reg s3/s2
+ pshufw mm1, %1, 10100000b ; s1 s1 s0 s0
+ pshufw mm0, %2, 10110001b ; s3 __ s2 __
+ paddw mm1, %2 ; s1 s13 s0 s02
+ psubw mm1, mm0 ; d13 s13 d02 s02
+ pshufw mm0, mm1, 01000100b ; d02 s02 d02 s02
+ psrlq mm1, 32 ; __ __ d13 s13
+ paddw mm0, mm1 ; d02 s02 d02+d13 s02+s13
+ psllq mm1, 32 ; d13 s13
+ psubw mm0, mm1 ; d02-d13 s02-s13 d02+d13 s02+s13
+%endmacro
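+
+; i.e. a 2x2 Hadamard of the four 4x4 sub-block sums s0..s3:
+;   dct[0] = s0+s1+s2+s3    dct[1] = s0+s1-s2-s3
+;   dct[2] = s0-s1+s2-s3    dct[3] = s0-s1-s2+s3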
+INIT_MMX
+cglobal sub8x8_dct_dc_mmxext, 3,3
+ DCTDC_2ROW_MMX m0, m4, 0
+ DCTDC_2ROW_MMX m5, m6, 2
+ paddw m0, m5
+ paddw m4, m6
+ punpckldq m0, m4
+ add r1, FENC_STRIDE*4
+ add r2, FDEC_STRIDE*4
+ DCTDC_2ROW_MMX m7, m4, 0
+ DCTDC_2ROW_MMX m5, m6, 2
+ paddw m7, m5
+ paddw m4, m6
+ punpckldq m7, m4
+ DCT2x2 m0, m7
+ movq [r0], m0
+    RET
+INIT_XMM
+%macro DCTDC_2ROW_SSE2 3
+ movq m0, [r1+FENC_STRIDE*(0+%1)]
+ movq m1, [r1+FENC_STRIDE*(1+%1)]
+ movq m2, [r2+FDEC_STRIDE*(0+%1)]
+ movq m3, [r2+FDEC_STRIDE*(1+%1)]
+ punpckldq m0, m1
+ punpckldq m2, m3
+ psadbw m0, m7
+ psadbw m2, m7
+%if %2
+ paddw %3, m0
+ paddw m6, m2
+%else
+ SWAP %3, m0
+ SWAP m6, m2
+%endif
+%endmacro
+
+cglobal sub8x8_dct_dc_sse2, 3,3,8
+ pxor m7, m7
+ DCTDC_2ROW_SSE2 0, 0, m4
+ DCTDC_2ROW_SSE2 2, 1, m4
+ add r1, FENC_STRIDE*4
+ add r2, FDEC_STRIDE*4
+ psubd m4, m6
+ DCTDC_2ROW_SSE2 0, 0, m5
+ DCTDC_2ROW_SSE2 2, 1, m5
+ psubd m5, m6
+ packssdw m4, m5
+ movhlps m5, m4
+ movdq2q mm0, m4
+ movdq2q mm7, m5
+ DCT2x2 mm0, mm7
+ movq [r0], mm0
+ RET
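+
+; Rough C model of one 4x4 DC term (psadbw against a zero register sums
+; the pixels of each half-row; the fdec sum is subtracted from the fenc
+; sum, so this is a sum of differences, not a SAD; helper name is
+; illustrative):
+;
+;   static int sub4x4_dc( uint8_t *p1, uint8_t *p2 )
+;   {
+;       int sum = 0;
+;       for( int y = 0; y < 4; y++, p1 += FENC_STRIDE, p2 += FDEC_STRIDE )
+;           for( int x = 0; x < 4; x++ )
+;               sum += p1[x] - p2[x];
+;       return sum;
+;   }
+;
+; The four sums are then combined with the 2x2 Hadamard in DCT2x2 above.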
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+%macro SCAN_8x8 1
+cglobal zigzag_scan_8x8_frame_%1, 2,2,8
+ movdqa xmm0, [r1]
+ movdqa xmm1, [r1+16]
+ movdq2q mm0, xmm0
+ PALIGNR xmm1, xmm1, 14, xmm2
+ movdq2q mm1, xmm1
+
+ movdqa xmm2, [r1+32]
+ movdqa xmm3, [r1+48]
+ PALIGNR xmm2, xmm2, 12, xmm4
+ movdq2q mm2, xmm2
+ PALIGNR xmm3, xmm3, 10, xmm4
+ movdq2q mm3, xmm3
+
+ punpckhwd xmm0, xmm1
+ punpckhwd xmm2, xmm3
+
+ movq mm4, mm1
+ movq mm5, mm1
+ movq mm6, mm2
+ movq mm7, mm3
+ punpckhwd mm1, mm0
+ psllq mm0, 16
+ psrlq mm3, 16
+ punpckhdq mm1, mm1
+ punpckhdq mm2, mm0
+ punpcklwd mm0, mm4
+ punpckhwd mm4, mm3
+ punpcklwd mm4, mm2
+ punpckhdq mm0, mm2
+ punpcklwd mm6, mm3
+ punpcklwd mm5, mm7
+ punpcklwd mm5, mm6
+
+ movdqa xmm4, [r1+64]
+ movdqa xmm5, [r1+80]
+ movdqa xmm6, [r1+96]
+ movdqa xmm7, [r1+112]
+
+ movq [r0+2*00], mm0
+ movq [r0+2*04], mm4
+ movd [r0+2*08], mm1
+ movq [r0+2*36], mm5
+ movq [r0+2*46], mm6
+
+ PALIGNR xmm4, xmm4, 14, xmm3
+ movdq2q mm4, xmm4
+ PALIGNR xmm5, xmm5, 12, xmm3
+ movdq2q mm5, xmm5
+ PALIGNR xmm6, xmm6, 10, xmm3
+ movdq2q mm6, xmm6
+%ifnidn %1, sse2
+ PALIGNR xmm7, xmm7, 8, xmm3
+ movdq2q mm7, xmm7
+%else
+ movhlps xmm3, xmm7
+ punpcklqdq xmm7, xmm7
+ movdq2q mm7, xmm3
+%endif
+
+ punpckhwd xmm4, xmm5
+ punpckhwd xmm6, xmm7
+
+ movq mm0, mm4
+ movq mm1, mm5
+ movq mm3, mm7
+ punpcklwd mm7, mm6
+ psrlq mm6, 16
+ punpcklwd mm4, mm6
+ punpcklwd mm5, mm4
+ punpckhdq mm4, mm3
+ punpcklwd mm3, mm6
+ punpckhwd mm3, mm4
+ punpckhwd mm0, mm1
+ punpckldq mm4, mm0
+ punpckhdq mm0, mm6
+ pshufw mm4, mm4, 0x6c
+
+ movq [r0+2*14], mm4
+ movq [r0+2*25], mm0
+ movd [r0+2*54], mm7
+ movq [r0+2*56], mm5
+ movq [r0+2*60], mm3
+
+ punpckhdq xmm3, xmm0, xmm2
+ punpckldq xmm0, xmm2
+ punpckhdq xmm7, xmm4, xmm6
+ punpckldq xmm4, xmm6
+ pshufhw xmm0, xmm0, 0x1b
+ pshuflw xmm4, xmm4, 0x1b
+ pshufhw xmm3, xmm3, 0x1b
+ pshuflw xmm7, xmm7, 0x1b
+
+ movlps [r0+2*10], xmm0
+ movhps [r0+2*17], xmm0
+ movlps [r0+2*21], xmm3
+ movlps [r0+2*28], xmm4
+ movhps [r0+2*32], xmm3
+ movhps [r0+2*39], xmm4
+ movlps [r0+2*43], xmm7
+ movhps [r0+2*50], xmm7
+
+ RET
+%endmacro
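+
+; The frame zigzag is pure data movement, so this routine splits the
+; shuffle work across the mmx and xmm register files to relieve register
+; pressure; the store offsets (2*NN) are positions in the scan order.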
+
+%ifndef HIGH_BIT_DEPTH
+INIT_XMM
+%define PALIGNR PALIGNR_MMX
+SCAN_8x8 sse2
+%define PALIGNR PALIGNR_SSSE3
+SCAN_8x8 ssse3
+%endif
+
+;-----------------------------------------------------------------------------
+; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
+;-----------------------------------------------------------------------------
+%macro SCAN_8x8_FRAME 6
+cglobal zigzag_scan_8x8_frame_%1, 2,2,8*(mmsize/16)
+ mova m0, [r1]
+ mova m1, [r1+ 8*SIZEOF_DCTCOEF]
+ movu m2, [r1+14*SIZEOF_DCTCOEF]
+ movu m3, [r1+21*SIZEOF_DCTCOEF]
+ mova m4, [r1+28*SIZEOF_DCTCOEF]
+ punpckl%5 m5, m0, m1
+ psrl%3 m0, %2
+ punpckh%5 m6, m1, m0
+ punpckl%4 m5, m0
+ punpckl%4 m1, m1
+ punpckh%5 m1, m3
+ mova m7, [r1+52*SIZEOF_DCTCOEF]
+ mova m0, [r1+60*SIZEOF_DCTCOEF]
+ punpckh%5 m1, m2
+ punpckl%5 m2, m4
+ punpckh%5 m4, m3
+ punpckl%4 m3, m3
+ punpckh%5 m3, m2
+ mova [r0], m5
+ mova [r0+ 4*SIZEOF_DCTCOEF], m1
+ mova [r0+ 8*SIZEOF_DCTCOEF], m6
+ punpckl%5 m6, m0
+ punpckl%5 m6, m7
+ mova m1, [r1+32*SIZEOF_DCTCOEF]
+ movu m5, [r1+39*SIZEOF_DCTCOEF]
+ movu m2, [r1+46*SIZEOF_DCTCOEF]
+ movu [r0+35*SIZEOF_DCTCOEF], m3
+ movu [r0+47*SIZEOF_DCTCOEF], m4
+ punpckh%5 m7, m0
+ psll%3 m0, %2
+ punpckh%4 m3, m5, m5
+ punpckl%5 m5, m1
+ punpckh%5 m1, m2
+ mova [r0+52*SIZEOF_DCTCOEF], m6
+ movu [r0+13*SIZEOF_DCTCOEF], m5
+ movu m4, [r1+11*SIZEOF_DCTCOEF]
+ movu m6, [r1+25*SIZEOF_DCTCOEF]
+ punpckl%5 m5, m7
+ punpckl%5 m1, m3
+ punpckh%4 m0, m7
+ mova m3, [r1+ 4*SIZEOF_DCTCOEF]
+ movu m7, [r1+18*SIZEOF_DCTCOEF]
+ punpckl%5 m2, m5
+ movu [r0+25*SIZEOF_DCTCOEF], m1
+ mova m1, m4
+ mova m5, m6
+ punpckl%5 m4, m3
+ punpckl%5 m6, m7
+ punpckh%5 m1, m3
+ punpckh%5 m5, m7
+ punpckh%4 m3, m6, m4
+ punpckh%4 m7, m5, m1
+ punpckl%4 m6, m4
+ punpckl%4 m5, m1
+ movu m4, [r1+35*SIZEOF_DCTCOEF]
+ movu m1, [r1+49*SIZEOF_DCTCOEF]
+ pshuf%6 m6, m6, 0x1b
+ pshuf%6 m5, m5, 0x1b
+ mova [r0+60*SIZEOF_DCTCOEF], m0
+ mova [r0+56*SIZEOF_DCTCOEF], m2
+ movu m0, [r1+42*SIZEOF_DCTCOEF]
+ mova m2, [r1+56*SIZEOF_DCTCOEF]
+ movu [r0+17*SIZEOF_DCTCOEF], m3
+ mova [r0+32*SIZEOF_DCTCOEF], m7
+ movu [r0+10*SIZEOF_DCTCOEF], m6
+ movu [r0+21*SIZEOF_DCTCOEF], m5
+ punpckh%5 m3, m0, m4
+ punpckh%5 m7, m2, m1
+ punpckl%5 m0, m4
+ punpckl%5 m2, m1
+ punpckl%4 m4, m2, m0
+ punpckl%4 m1, m7, m3
+ punpckh%4 m2, m0
+ punpckh%4 m7, m3
+ pshuf%6 m2, m2, 0x1b
+ pshuf%6 m7, m7, 0x1b
+ mova [r0+28*SIZEOF_DCTCOEF], m4
+ movu [r0+43*SIZEOF_DCTCOEF], m1
+ movu [r0+39*SIZEOF_DCTCOEF], m2
+ movu [r0+50*SIZEOF_DCTCOEF], m7
+ RET
+%endmacro
+
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+SCAN_8x8_FRAME sse2 , 4 , dq, qdq, dq, d
+INIT_AVX
+SCAN_8x8_FRAME avx , 4 , dq, qdq, dq, d
+%else
+INIT_MMX
+SCAN_8x8_FRAME mmxext, 16, q , dq , wd, w
+%endif
+
+;-----------------------------------------------------------------------------
+; void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[4][4] )
+;-----------------------------------------------------------------------------
+%macro SCAN_4x4 5
+cglobal zigzag_scan_4x4_frame_%1, 2,2,8*(mmsize/16)
+ mova m0, [r1]
+ mova m1, [r1+ 4*SIZEOF_DCTCOEF]
+ mova m2, [r1+ 8*SIZEOF_DCTCOEF]
+ mova m3, [r1+12*SIZEOF_DCTCOEF]
+ punpckl%5 m4, m0, m1
+ mova m5, m1
+ mova m6, m2
+ mova m7, m3
+ psll%3 m3, %2
+ psrl%3 m0, %2
+ punpckl%4 m2, m2
+ punpckh%4 m1, m1
+ punpckl%5 m5, m3
+ punpckl%4 m4, m0
+ punpckh%5 m5, m2
+ punpckh%5 m0, m6
+ punpckh%5 m6, m7
+ punpckl%5 m1, m0
+ punpckh%4 m3, m6
+ mova [r0], m4
+ mova [r0+ 4*SIZEOF_DCTCOEF], m5
+ mova [r0+ 8*SIZEOF_DCTCOEF], m1
+ mova [r0+12*SIZEOF_DCTCOEF], m3
+ RET
+%endmacro
+
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+SCAN_4x4 sse2, 4 , dq, qdq, dq
+INIT_AVX
+SCAN_4x4 avx , 4 , dq, qdq, dq
+%else
+INIT_MMX
+SCAN_4x4 mmx , 16, q , dq , wd
+
+;-----------------------------------------------------------------------------
+; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
+;-----------------------------------------------------------------------------
+%macro SCAN_4x4_FRAME 1
+cglobal zigzag_scan_4x4_frame_%1, 2,2
+ movdqa xmm1, [r1+16]
+ movdqa xmm0, [r1]
+ pshufb xmm1, [pb_scan4frameb]
+ pshufb xmm0, [pb_scan4framea]
+ psrldq xmm2, xmm1, 6
+ palignr xmm1, xmm0, 6
+ pslldq xmm0, 10
+ palignr xmm2, xmm0, 10
+ movdqa [r0], xmm1
+ movdqa [r0+16], xmm2
+ RET
+%endmacro
+
+INIT_XMM
+SCAN_4x4_FRAME ssse3
+INIT_AVX
+SCAN_4x4_FRAME avx
+%endif ; !HIGH_BIT_DEPTH
+
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+;-----------------------------------------------------------------------------
+; void zigzag_scan_4x4_field( int32_t level[16], int32_t dct[4][4] )
+;-----------------------------------------------------------------------------
+cglobal zigzag_scan_4x4_field_sse2, 2,3
+ movu m4, [r1+8]
+ pshufd m0, m4, 0xd2
+ mova m1, [r1+32]
+ mova m2, [r1+48]
+ movu [r0+8], m0
+ mova [r0+32], m1
+ mova [r0+48], m2
+ movq mm0, [r1]
+ movq [r0], mm0
+ movq mm0, [r1+24]
+ movq [r0+24], mm0
+ RET
+%else
+;-----------------------------------------------------------------------------
+; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
-cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
+cglobal zigzag_scan_4x4_field_mmxext, 2,3
    pshufw     mm0, [r1+4], 0xd2
    movq       mm1, [r1+16]
    movq       mm2, [r1+24]
    movq    [r0+4], mm0
    movq   [r0+16], mm1
    movq   [r0+24], mm2
    mov        r2d, [r1]
    mov       [r0], r2d
    mov        r2d, [r1+12]
    mov    [r0+12], r2d
    RET
+%endif ; HIGH_BIT_DEPTH
+
+;-----------------------------------------------------------------------------
+; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
+;-----------------------------------------------------------------------------
+; Output order:
+; 0 1 2 8 9 3 4 10
+; 16 11 5 6 7 12 17 24
+; 18 13 14 15 19 25 32 26
+; 20 21 22 23 27 33 40 34
+; 28 29 30 31 35 41 48 42
+; 36 37 38 39 43 49 50 44
+; 45 46 47 51 56 57 52 53
+; 54 55 58 59 60 61 62 63
+%undef SCAN_8x8
+%macro SCAN_8x8 6
+cglobal zigzag_scan_8x8_field_%1, 2,3,8*(mmsize/16)
+ mova m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00
+ mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
+ mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
+ pshuf%2 m3, m0, 011111111b ; 03 03 03 03
+ movd r2, m2 ; 09 08
+ pshuf%2 m2, m2, 000111001b ; 08 11 10 09
+ punpckl%3 m3, m1 ; 05 03 04 03
+ pinsr%2 m0, r2d, 3 ; 08 02 01 00
+ punpckl%3 m4, m2, m3 ; 04 10 03 09
+ pshuf%2 m4, m4, 010110100b ; 10 04 03 09
+ mova [r0+ 0*SIZEOF_DCTCOEF], m0 ; 08 02 01 00
+ mova [r0+ 4*SIZEOF_DCTCOEF], m4 ; 10 04 03 09
+ mova m3, [r1+12*SIZEOF_DCTCOEF] ; 15 14 13 12
+ mova m5, [r1+16*SIZEOF_DCTCOEF] ; 19 18 17 16
+ punpckl%4 m6, m5 ; 17 16 XX XX
+ psrl%5 m1, %6 ; XX 07 06 05
+ punpckh%3 m6, m2 ; 08 17 11 16
+ punpckl%4 m6, m1 ; 06 05 11 16
+ mova [r0+ 8*SIZEOF_DCTCOEF], m6 ; 06 05 11 16
+ psrl%5 m1, %6 ; XX XX 07 06
+ punpckl%3 m1, m5 ; 17 07 16 06
+ mova m0, [r1+20*SIZEOF_DCTCOEF] ; 23 22 21 20
+ mova m2, [r1+24*SIZEOF_DCTCOEF] ; 27 26 25 24
+ punpckh%4 m1, m1 ; 17 07 17 07
+ punpckl%3 m6, m3, m2 ; 25 13 24 12
+ pextr%2 r2d, m5, 2
+ mova [r0+24*SIZEOF_DCTCOEF], m0 ; 23 22 21 20
+ punpckl%3 m1, m6 ; 24 17 12 07
+ mova [r0+12*SIZEOF_DCTCOEF], m1
+ pinsr%2 m3, r2d, 0 ; 15 14 13 18
+ mova [r0+16*SIZEOF_DCTCOEF], m3 ; 15 14 13 18
+ mova m7, [r1+28*SIZEOF_DCTCOEF]
+ mova m0, [r1+32*SIZEOF_DCTCOEF] ; 35 34 33 32
+ psrl%5 m5, %6*3 ; XX XX XX 19
+ pshuf%2 m1, m2, 011111001b ; 27 27 26 25
+ punpckl%3 m5, m0 ; 33 XX 32 19
+ psrl%5 m2, %6*3 ; XX XX XX 27
+ punpckl%3 m5, m1 ; 26 32 25 19
+ mova [r0+32*SIZEOF_DCTCOEF], m7
+ mova [r0+20*SIZEOF_DCTCOEF], m5 ; 26 32 25 19
+ mova m7, [r1+36*SIZEOF_DCTCOEF]
+ mova m1, [r1+40*SIZEOF_DCTCOEF] ; 43 42 41 40
+ pshuf%2 m3, m0, 011111001b ; 35 35 34 33
+ punpckl%3 m2, m1 ; 41 XX 40 27
+ mova [r0+40*SIZEOF_DCTCOEF], m7
+ punpckl%3 m2, m3 ; 34 40 33 27
+ mova [r0+28*SIZEOF_DCTCOEF], m2
+ mova m7, [r1+44*SIZEOF_DCTCOEF] ; 47 46 45 44
+ mova m2, [r1+48*SIZEOF_DCTCOEF] ; 51 50 49 48
+ psrl%5 m0, %6*3 ; XX XX XX 35
+ punpckl%3 m0, m2 ; 49 XX 48 35
+ pshuf%2 m3, m1, 011111001b ; 43 43 42 41
+ punpckl%3 m0, m3 ; 42 48 41 35
+ mova [r0+36*SIZEOF_DCTCOEF], m0
+ pextr%2 r2d, m2, 3 ; 51
+ psrl%5 m1, %6*3 ; XX XX XX 43
+ punpckl%3 m1, m7 ; 45 XX 44 43
+ psrl%5 m2, %6 ; XX 51 50 49
+ punpckl%3 m1, m2 ; 50 44 49 43
+ pshuf%2 m1, m1, 010110100b ; 44 50 49 43
+ mova [r0+44*SIZEOF_DCTCOEF], m1
+ psrl%5 m7, %6 ; XX 47 46 45
+ pinsr%2 m7, r2d, 3 ; 51 47 46 45
+ mova [r0+48*SIZEOF_DCTCOEF], m7
+ mova m0, [r1+56*SIZEOF_DCTCOEF] ; 59 58 57 56
+ mova m1, [r1+52*SIZEOF_DCTCOEF] ; 55 54 53 52
+ mova m7, [r1+60*SIZEOF_DCTCOEF]
+ punpckl%4 m2, m0, m1 ; 53 52 57 56
+ punpckh%4 m1, m0 ; 59 58 55 54
+ mova [r0+52*SIZEOF_DCTCOEF], m2
+ mova [r0+56*SIZEOF_DCTCOEF], m1
+ mova [r0+60*SIZEOF_DCTCOEF], m7
+ RET
+%endmacro
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+SCAN_8x8 sse4 , d, dq, qdq, dq, 4
+INIT_AVX
+SCAN_8x8 avx , d, dq, qdq, dq, 4
+%else
+INIT_MMX
+SCAN_8x8 mmxext, w, wd, dq , q , 16
+%endif
;-----------------------------------------------------------------------------
-; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
+; int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
+%macro ZIGZAG_SUB_4x4 3
+%ifidn %1, ac
+cglobal zigzag_sub_4x4%1_%2_%3, 4,4,8
+%else
+cglobal zigzag_sub_4x4%1_%2_%3, 3,3,8
+%endif
    movd      xmm0, [r1+0*FENC_STRIDE]
    movd      xmm1, [r1+1*FENC_STRIDE]
    movd      xmm2, [r1+2*FENC_STRIDE]
    movd      xmm3, [r1+3*FENC_STRIDE]
    movd      xmm4, [r2+0*FDEC_STRIDE]
    movd      xmm5, [r2+1*FDEC_STRIDE]
    movd      xmm6, [r2+2*FDEC_STRIDE]
    movd      xmm7, [r2+3*FDEC_STRIDE]
    movd      [r2+0*FDEC_STRIDE], xmm0
    movd      [r2+1*FDEC_STRIDE], xmm1
    movd      [r2+2*FDEC_STRIDE], xmm2
    movd      [r2+3*FDEC_STRIDE], xmm3
- picgetgot r1
punpckldq xmm0, xmm1
punpckldq xmm2, xmm3
punpckldq xmm4, xmm5
punpckldq xmm6, xmm7
- movlhps xmm0, xmm2
- movlhps xmm4, xmm6
- movdqa xmm7, [pb_zigzag4 GLOBAL]
+ punpcklqdq xmm0, xmm2
+ punpcklqdq xmm4, xmm6
+%ifidn %2, frame
+ movdqa xmm7, [pb_sub4frame]
+%else
+ movdqa xmm7, [pb_sub4field]
+%endif
pshufb xmm0, xmm7
pshufb xmm4, xmm7
pxor xmm6, xmm6
- movdqa xmm1, xmm0
- movdqa xmm5, xmm4
+ punpckhbw xmm1, xmm0, xmm6
+ punpckhbw xmm5, xmm4, xmm6
punpcklbw xmm0, xmm6
- punpckhbw xmm1, xmm6
punpcklbw xmm4, xmm6
- punpckhbw xmm5, xmm6
psubw xmm0, xmm4
psubw xmm1, xmm5
+%ifidn %1, ac
+ movd r2d, xmm0
+ pand xmm0, [pb_subacmask]
+%endif
movdqa [r0], xmm0
+ pxor xmm2, xmm2
movdqa [r0+16], xmm1
+ por xmm0, xmm1
+ pcmpeqb xmm0, xmm2
+ pmovmskb eax, xmm0
+%ifidn %1, ac
+ mov [r3], r2w
+%endif
+ sub eax, 0xffff
+ shr eax, 31
RET
+%endmacro
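+
+; Returns 1 in eax if the block has any nonzero coefficient: pmovmskb of
+; the zero-compare is 0xffff only for an all-zero block, so sub/shr maps
+; that case to 0 and everything else to 1. The "ac" variants additionally
+; store the DC coefficient to *r3 and mask it out of the scanned output.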
+
+INIT_XMM
+ZIGZAG_SUB_4x4 , frame, ssse3
+ZIGZAG_SUB_4x4 ac, frame, ssse3
+ZIGZAG_SUB_4x4 , field, ssse3
+ZIGZAG_SUB_4x4 ac, field, ssse3
+INIT_AVX
+ZIGZAG_SUB_4x4 , frame, avx
+ZIGZAG_SUB_4x4 ac, frame, avx
+ZIGZAG_SUB_4x4 , field, avx
+ZIGZAG_SUB_4x4 ac, field, avx
+
+;-----------------------------------------------------------------------------
+; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
+;-----------------------------------------------------------------------------
+%macro INTERLEAVE 2
+ mova m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL]
+ mova m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL]
+ mova m2, [r1+(%1*4+16)*SIZEOF_PIXEL]
+ mova m3, [r1+(%1*4+24)*SIZEOF_PIXEL]
+ TRANSPOSE4x4%2 0,1,2,3,4
+ mova [r0+(%1+ 0)*SIZEOF_PIXEL], m0
+ mova [r0+(%1+32)*SIZEOF_PIXEL], m1
+ mova [r0+(%1+64)*SIZEOF_PIXEL], m2
+ mova [r0+(%1+96)*SIZEOF_PIXEL], m3
+ packsswb m0, m1
+%if %1
+ por m6, m2
+ por m7, m3
+ por m5, m0
+%else
+ SWAP 5, 0
+ SWAP 6, 2
+ SWAP 7, 3
+%endif
+%endmacro
+
+%macro ZIGZAG_8x8_CAVLC 2
+cglobal zigzag_interleave_8x8_cavlc_%1, 3,3,8*(mmsize/16)
+ INTERLEAVE 0, %2
+ INTERLEAVE 8, %2
+ INTERLEAVE 16, %2
+ INTERLEAVE 24, %2
+ packsswb m6, m7
+ packsswb m5, m6
+ packsswb m5, m5
+ pxor m0, m0
+%ifdef HIGH_BIT_DEPTH
+ packsswb m5, m5
+%endif
+ pcmpeqb m5, m0
+ paddb m5, [pb_1]
+ movd r0d, m5
+ mov [r2+0], r0w
+ shr r0d, 16
+ mov [r2+8], r0w
+ RET
+%endmacro
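+
+; CAVLC codes an 8x8 transform as four interleaved 4x4 scans: 4x4 list n
+; takes coefficients n, n+4, n+8, ... The packsswb/pcmpeqb/paddb tail
+; turns each list's "any nonzero?" result into a 0/1 byte, two of which
+; are stored at [r2] and two at [r2+8] (the caller's nnz layout).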
+
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+ZIGZAG_8x8_CAVLC sse2, D
+INIT_AVX
+ZIGZAG_8x8_CAVLC avx , D
+%else
+INIT_MMX
+ZIGZAG_8x8_CAVLC mmx , W
+%endif
+
+%macro INTERLEAVE_XMM 1
+ mova m0, [r1+%1*4+ 0]
+ mova m1, [r1+%1*4+16]
+ mova m4, [r1+%1*4+32]
+ mova m5, [r1+%1*4+48]
+ SBUTTERFLY wd, 0, 1, 6
+ SBUTTERFLY wd, 4, 5, 7
+ SBUTTERFLY wd, 0, 1, 6
+ SBUTTERFLY wd, 4, 5, 7
+ movq [r0+%1+ 0], m0
+ movhps [r0+%1+ 32], m0
+ movq [r0+%1+ 64], m1
+ movhps [r0+%1+ 96], m1
+ movq [r0+%1+ 8], m4
+ movhps [r0+%1+ 40], m4
+ movq [r0+%1+ 72], m5
+ movhps [r0+%1+104], m5
+%if %1
+ por m2, m0
+ por m3, m1
+ por m2, m4
+ por m3, m5
+%else
+ SWAP 0,2
+ SWAP 3,1
+ por m2, m4
+ por m3, m5
+%endif
+%endmacro
+
+%ifndef HIGH_BIT_DEPTH
+%macro ZIGZAG_8x8_CAVLC 1
+cglobal zigzag_interleave_8x8_cavlc_%1, 3,3,8
+ INTERLEAVE_XMM 0
+ INTERLEAVE_XMM 16
+ packsswb m2, m3
+ pxor m5, m5
+ packsswb m2, m2
+ packsswb m2, m2
+ pcmpeqb m5, m2
+ paddb m5, [pb_1]
+ movd r0d, m5
+ mov [r2+0], r0w
+ shr r0d, 16
+ mov [r2+8], r0w
+ RET
+%endmacro
+
+INIT_XMM
+ZIGZAG_8x8_CAVLC sse2
+INIT_AVX
+ZIGZAG_8x8_CAVLC avx
+%endif ; !HIGH_BIT_DEPTH