;*****************************************************************************
-;* dct-a.asm: h264 encoder library
+;* dct-a.asm: x86 transform and zigzag
;*****************************************************************************
-;* Copyright (C) 2003-2008 x264 project
+;* Copyright (C) 2003-2010 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;* Loren Merritt <lorenm@u.washington.edu>
;* Laurent Aimar <fenrir@via.ecp.fr>
;* Min Chen <chenm001.163.com>
+;* Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%endmacro
SECTION_RODATA
-pw_32_0: times 4 dw 32
- times 4 dw 0
-pw_32: times 8 dw 32
-pw_8000: times 8 dw 0x8000
-hsub_mul: times 8 db 1, -1
-
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
pb_scan4frameb: SHUFFLE_16BIT 0,4,1,2,5,6,3,7
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
-pb_1: times 16 db 1
-pw_1: times 8 dw 1
SECTION .text
+cextern pw_32_0
+cextern pw_32
+cextern pw_8000
+cextern hsub_mul
+cextern pb_1
+cextern pw_1
+
%macro WALSH4_1D 5
SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
INIT_MMX
;-----------------------------------------------------------------------------
-; void x264_dct4x4dc_mmx( int16_t d[4][4] )
+; void dct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_dct4x4dc_mmx, 1,1
+cglobal dct4x4dc_mmx, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
movq m0, [r0+ 0]
- movq m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works
+ movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
WALSH4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
SUMSUB_BADC m1, m0, m3, m2, m4
RET
;-----------------------------------------------------------------------------
-; void x264_idct4x4dc_mmx( int16_t d[4][4] )
+; void idct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_idct4x4dc_mmx, 1,1
+cglobal idct4x4dc_mmx, 1,1
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
movq [r0+24], m3
RET
+%ifdef X264_HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void sub4x4_dct( int32_t dct[4][4], uint16_t *pix1, uint16_t *pix2 )
+;-----------------------------------------------------------------------------
+; High-bit-depth 4x4 residual DCT: dct = transform( pix1 - pix2 ).
+; Pixels are 16-bit, so per-row byte offsets are doubled: 0,6,2,4 loads
+; rows in 0,3,1,2 order -- presumably the register order DCT4_1D expects;
+; confirm against the LOAD_DIFF/DCT4_1D definitions (not visible here).
+cglobal sub4x4_dct_mmx, 3,3
+.skip_prologue: ; tail-call entry point used by the SUB_NxN_DCT wrappers
+ LOAD_DIFF m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
+ LOAD_DIFF m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
+ LOAD_DIFF m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
+ LOAD_DIFF m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
+ DCT4_1D 0,1,2,3,4 ; first 1-D pass
+ TRANSPOSE4x4W 0,1,2,3,4
+ DCT4_1D 0,1,2,3,4 ; second 1-D pass (on transposed data)
+ STORE_DIFF m0, m4, m5, [r0+ 0], [r0+ 8] ; 4 x int32 = 16 bytes per row
+ STORE_DIFF m1, m4, m5, [r0+16], [r0+24]
+ STORE_DIFF m2, m4, m5, [r0+32], [r0+40]
+ STORE_DIFF m3, m4, m5, [r0+48], [r0+56]
+ RET
+%endif ; X264_HIGH_BIT_DEPTH
+
+%ifndef X264_HIGH_BIT_DEPTH
%macro SUB_DCT4 1
;-----------------------------------------------------------------------------
-; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
+; void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
-cglobal x264_sub4x4_dct_%1, 3,3
+cglobal sub4x4_dct_%1, 3,3
%ifidn %1, mmx
.skip_prologue:
LOAD_DIFF m0, m4, m5, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m1, m4, m5, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF m2, m4, m5, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
%else
- mova m5, [hsub_mul GLOBAL]
+ mova m5, [hsub_mul]
LOAD_DIFF8x4_SSSE3 0, 3, 1, 2, 4, 5, r1, r2
%endif
DCT4_1D 0,1,2,3,4
SUB_DCT4 mmx
SUB_DCT4 ssse3
+%endif ; !X264_HIGH_BIT_DEPTH
+%ifndef X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
-; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
+; void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_add4x4_idct_mmx, 2,2
+cglobal add4x4_idct_mmx, 2,2
pxor m7, m7
.skip_prologue:
movq m1, [r1+ 8]
movq m0, [r1+ 0]
IDCT4_1D 0,1,2,3,4,5
TRANSPOSE4x4W 0,1,2,3,4
- paddw m0, [pw_32 GLOBAL]
+ paddw m0, [pw_32]
IDCT4_1D 0,1,2,3,4,5
STORE_DIFF m0, m4, m7, [r0+0*FDEC_STRIDE]
STORE_DIFF m1, m4, m7, [r0+1*FDEC_STRIDE]
RET
INIT_XMM
-cglobal x264_add4x4_idct_sse4, 2,2,6
+cglobal add4x4_idct_sse4, 2,2,6
mova m0, [r1+0x00] ; row1/row0
mova m2, [r1+0x10] ; row3/row2
mova m1, m0 ; row1/row0
punpckhdq m2, m0
SWAP 0, 1
- mova m1, [pw_32_0 GLOBAL]
+ mova m1, [pw_32_0]
paddw m1, m0 ; row1/row0 corrected
psraw m0, 1 ; row1>>1/...
mova m3, m2 ; row3/row2
movd [r0+FDEC_STRIDE*2], m0
pextrd [r0+FDEC_STRIDE*3], m0, 1
RET
+%endif ; !X264_HIGH_BIT_DEPTH
INIT_MMX
;-----------------------------------------------------------------------------
-; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
+; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
-cglobal %1, 3,3,11
+cglobal %1, 3,3,11*(mmsize/16)
+%ifndef X264_HIGH_BIT_DEPTH
%if mmsize == 8
pxor m7, m7
%else
add r2, 4*FDEC_STRIDE
- mova m7, [hsub_mul GLOBAL]
+ mova m7, [hsub_mul]
%endif
+%endif ; !X264_HIGH_BIT_DEPTH
.skip_prologue:
%ifdef WIN64
sub rsp, 8
%endmacro
;-----------------------------------------------------------------------------
-; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
+; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
-cglobal %1, 2,2,11
+cglobal %1, 2,2,11*(mmsize/16)
pxor m7, m7
%if mmsize==16
add r0, 4*FDEC_STRIDE
%endif
%endmacro
+%ifdef X264_HIGH_BIT_DEPTH
+INIT_MMX
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 64, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 64, 16, 8, 8
+%else ; !X264_HIGH_BIT_DEPTH
%ifndef ARCH_X86_64
-SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
-ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
-SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
-ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
-
-cextern x264_sub8x8_dct8_mmx.skip_prologue
-cextern x264_add8x8_idct8_mmx.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub8x8_dct_mmx, sub4x4_dct_mmx.skip_prologue, 32, 4, 0, 0
+ADD_NxN_IDCT add8x8_idct_mmx, add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
+SUB_NxN_DCT sub16x16_dct_mmx, sub8x8_dct_mmx.skip_prologue, 32, 8, 4, 4
+ADD_NxN_IDCT add16x16_idct_mmx, add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
+
+cextern sub8x8_dct8_mmx.skip_prologue
+cextern add8x8_idct8_mmx.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_mmx, sub8x8_dct8_mmx.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
%endif
INIT_XMM
-cextern x264_sub8x8_dct_sse2.skip_prologue
-cextern x264_sub8x8_dct_ssse3.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
-SUB_NxN_DCT x264_sub16x16_dct_ssse3, x264_sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
-cextern x264_add8x8_idct_sse2.skip_prologue
-ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
+cextern sub8x8_dct_sse2.skip_prologue
+cextern sub8x8_dct_ssse3.skip_prologue
+SUB_NxN_DCT sub16x16_dct_sse2, sub8x8_dct_sse2.skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT sub16x16_dct_ssse3, sub8x8_dct_ssse3.skip_prologue, 128, 8, 0, 0
+cextern add8x8_idct_sse2.skip_prologue
+ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2.skip_prologue, 2*64, 8, 0, 0
-cextern x264_sub8x8_dct8_sse2.skip_prologue
-cextern x264_add8x8_idct8_sse2.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
+cextern sub8x8_dct8_sse2.skip_prologue
+cextern add8x8_idct8_sse2.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2.skip_prologue, 128, 8, 0, 0
+ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2.skip_prologue, 128, 8, 0, 0
-cextern x264_sub8x8_dct8_ssse3.skip_prologue
-SUB_NxN_DCT x264_sub16x16_dct8_ssse3, x264_sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
+cextern sub8x8_dct8_ssse3.skip_prologue
+SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3.skip_prologue, 128, 8, 0, 0
+%endif ; X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
movq [%3+FDEC_STRIDE*3], %1
%endmacro
-cglobal x264_add8x8_idct_dc_mmx, 2,2
+cglobal add8x8_idct_dc_mmx, 2,2
movq mm0, [r1]
pxor mm1, mm1
add r0, FDEC_STRIDE*4
- paddw mm0, [pw_32 GLOBAL]
+ paddw mm0, [pw_32]
psraw mm0, 6
psubw mm1, mm0
packuswb mm0, mm0
ADD_DC mm2, mm3, r0
RET
-cglobal x264_add8x8_idct_dc_ssse3, 2,2
+cglobal add8x8_idct_dc_ssse3, 2,2
movq xmm0, [r1]
pxor xmm1, xmm1
add r0, FDEC_STRIDE*4
- paddw xmm0, [pw_32 GLOBAL]
+ paddw xmm0, [pw_32]
psraw xmm0, 6
psubw xmm1, xmm0
- movdqa xmm5, [pb_idctdc_unpack GLOBAL]
+ movdqa xmm5, [pb_idctdc_unpack]
packuswb xmm0, xmm0
packuswb xmm1, xmm1
pshufb xmm0, xmm5
movhps [r0+FDEC_STRIDE* 3], xmm5
RET
-cglobal x264_add16x16_idct_dc_mmx, 2,3
+cglobal add16x16_idct_dc_mmx, 2,3
mov r2, 4
.loop:
movq mm0, [r1]
pxor mm1, mm1
- paddw mm0, [pw_32 GLOBAL]
+ paddw mm0, [pw_32]
psraw mm0, 6
psubw mm1, mm0
packuswb mm0, mm0
movdqa [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro
-cglobal x264_add16x16_idct_dc_sse2, 2,2,8
+cglobal add16x16_idct_dc_sse2, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
punpcklwd xmm2, xmm2
pxor xmm1, xmm1
pxor xmm3, xmm3
- paddw xmm0, [pw_32 GLOBAL]
- paddw xmm2, [pw_32 GLOBAL]
+ paddw xmm0, [pw_32]
+ paddw xmm2, [pw_32]
psraw xmm0, 6
psraw xmm2, 6
psubw xmm1, xmm0
IDCT_DC_STORE 0, xmm2, xmm3
ret
-cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
+cglobal add16x16_idct_dc_ssse3, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
%ifdef WIN64
movdqa xmm0, [r1]
add r1, 16
pxor xmm1, xmm1
- paddw xmm0, [pw_32 GLOBAL]
+ paddw xmm0, [pw_32]
psraw xmm0, 6
psubw xmm1, xmm0
- movdqa xmm5, [ pb_idctdc_unpack GLOBAL]
- movdqa xmm6, [pb_idctdc_unpack2 GLOBAL]
+ movdqa xmm5, [ pb_idctdc_unpack]
+ movdqa xmm6, [pb_idctdc_unpack2]
packuswb xmm0, xmm0
packuswb xmm1, xmm1
movdqa xmm2, xmm0
movq m1, m2
punpckldq m2, m3
punpckhdq m1, m3
- psadbw %1, m7
- psadbw %2, m7
- psadbw m2, m7
- psadbw m1, m7
+ pxor m3, m3
+ psadbw %1, m3
+ psadbw %2, m3
+ psadbw m2, m3
+ psadbw m1, m3
psubw %1, m2
psubw %2, m1
%endmacro
+; DCT2x2: 2x2 Hadamard transform, used by sub8x8_dct_dc_* to transform the
+; four per-4x4-block DC sums.
+; In:  %1 = mmx reg holding s1/s0 (must NOT be mm1), %2 = mmx reg s3/s2
+; Out: mm0 = [d02-d13, s02-s13, d02+d13, s02+s13]
+;      where sAB = sA + sB and dAB = sA - sB (per-word lanes, high to low).
+; Clobbers mm0 and mm1.
+%macro DCT2x2 2 ; reg s1/s0 (!=m1), reg s3/s2
+ pshufw mm1, %1, 10100000b ; s1 s1 s0 s0
+ pshufw mm0, %2, 10110001b ; s3 __ s2 __
+ paddw mm1, %2 ; s1 s13 s0 s02
+ psubw mm1, mm0 ; d13 s13 d02 s02
+ pshufw mm0, mm1, 01000100b ; d02 s02 d02 s02
+ psrlq mm1, 32 ; __ __ d13 s13
+ paddw mm0, mm1 ; d02 s02 d02+d13 s02+s13
+ psllq mm1, 32 ; d13 s13
+ psubw mm0, mm1 ; d02-d13 s02-s13 d02+d13 s02+s13
+%endmacro
+
INIT_MMX
-cglobal x264_sub8x8_dct_dc_mmxext, 3,3
- pxor m7, m7
- call .loop
- add r1, FENC_STRIDE*4
- add r2, FDEC_STRIDE*4
- add r0, 4
-.loop:
+; sub8x8_dct_dc: sum each 4x4 quadrant of (pix1 - pix2), then apply the 2x2
+; Hadamard (DCT2x2) to the four sums. The old call/.loop version stored raw
+; sums without the 2x2 transform; this rewrite straight-lines both halves.
+cglobal sub8x8_dct_dc_mmxext, 3,3
 DCTDC_2ROW_MMX m0, m4, 0
 DCTDC_2ROW_MMX m5, m6, 2
 paddw m0, m5
 paddw m4, m6
- punpcklwd m0, m4
- movd [r0], m0
+ punpckldq m0, m4 ; m0 = s1/s0: DC sums of the two top 4x4 blocks
+ add r1, FENC_STRIDE*4 ; advance to the bottom half of the 8x8 block
+ add r2, FDEC_STRIDE*4
+ DCTDC_2ROW_MMX m7, m4, 0
+ DCTDC_2ROW_MMX m5, m6, 2
+ paddw m7, m5
+ paddw m4, m6
+ punpckldq m7, m4 ; m7 = s3/s2: DC sums of the two bottom 4x4 blocks
+ DCT2x2 m0, m7 ; 2x2 Hadamard of the four DC sums (result in m0)
+ movq [r0], m0 ; store the four 16-bit DC coefficients
 ret
INIT_XMM
%endif
%endmacro
-cglobal x264_sub8x8_dct_dc_sse2, 3,3,8
+cglobal sub8x8_dct_dc_sse2, 3,3,8
 pxor m7, m7
 DCTDC_2ROW_SSE2 0, 0, m4
 DCTDC_2ROW_SSE2 2, 1, m4
 add r1, FENC_STRIDE*4
 add r2, FDEC_STRIDE*4
- psubq m4, m6
+ psubd m4, m6 ; fix: sums are packed dwords (see packssdw below), not qwords
 DCTDC_2ROW_SSE2 0, 0, m5
 DCTDC_2ROW_SSE2 2, 1, m5
- psubq m5, m6
+ psubd m5, m6
 packssdw m4, m5
- packssdw m4, m4
- movq [r0], m4
+ movhlps m5, m4 ; m5 = upper 64 bits of m4
+ movdq2q mm0, m4 ; move into MMX regs: DCT2x2 is written with pshufw on mm0/mm1
+ movdq2q mm7, m5
+ DCT2x2 mm0, mm7 ; 2x2 Hadamard of the four DC sums (was missing before)
+ movq [r0], mm0 ; store the four 16-bit DC coefficients
 RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
+; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8 1
-cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8
+cglobal zigzag_scan_8x8_frame_%1, 2,2,8
movdqa xmm0, [r1]
movdqa xmm1, [r1+16]
movdq2q mm0, xmm0
SCAN_8x8 ssse3
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] )
+; void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
+cglobal zigzag_scan_8x8_frame_mmxext, 2,2
movq mm0, [r1]
movq mm1, [r1+2*8]
movq mm2, [r1+2*14]
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_frame_mmx( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
+cglobal zigzag_scan_4x4_frame_mmx, 2,2
movq mm0, [r1]
movq mm1, [r1+8]
movq mm2, [r1+16]
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
-cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
+cglobal zigzag_scan_4x4_frame_ssse3, 2,2
movdqa xmm1, [r1+16]
movdqa xmm0, [r1]
- pshufb xmm1, [pb_scan4frameb GLOBAL]
- pshufb xmm0, [pb_scan4framea GLOBAL]
+ pshufb xmm1, [pb_scan4frameb]
+ pshufb xmm0, [pb_scan4framea]
movdqa xmm2, xmm1
psrldq xmm1, 6
palignr xmm2, xmm0, 6
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
+; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
-cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
+cglobal zigzag_scan_4x4_field_mmxext, 2,3
pshufw mm0, [r1+4], 0xd2
movq mm1, [r1+16]
movq mm2, [r1+24]
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[8][8] )
+; void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
; Output order:
; 45 46 47 51 56 57 52 53
; 54 55 58 59 60 61 62 63
-cglobal x264_zigzag_scan_8x8_field_mmxext, 2,3
+cglobal zigzag_scan_8x8_field_mmxext, 2,3
movq mm0, [r1+2*0] ; 03 02 01 00
movq mm1, [r1+2*4] ; 07 06 05 04
movq mm2, [r1+2*8] ; 11 10 09 08
RET
;-----------------------------------------------------------------------------
-; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
+; void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *src, uint8_t *dst )
;-----------------------------------------------------------------------------
%macro ZIGZAG_SUB_4x4 2
%ifidn %1, ac
-cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 4,4,8
+cglobal zigzag_sub_4x4%1_%2_ssse3, 4,4,8
%else
-cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
+cglobal zigzag_sub_4x4%1_%2_ssse3, 3,3,8
%endif
movd xmm0, [r1+0*FENC_STRIDE]
movd xmm1, [r1+1*FENC_STRIDE]
punpcklqdq xmm0, xmm2
punpcklqdq xmm4, xmm6
%ifidn %2, frame
- movdqa xmm7, [pb_sub4frame GLOBAL]
+ movdqa xmm7, [pb_sub4frame]
%else
- movdqa xmm7, [pb_sub4field GLOBAL]
+ movdqa xmm7, [pb_sub4field]
%endif
pshufb xmm0, xmm7
pshufb xmm4, xmm7
psubw xmm1, xmm5
%ifidn %1, ac
movd r2d, xmm0
- pand xmm0, [pb_subacmask GLOBAL]
+ pand xmm0, [pb_subacmask]
%endif
movdqa [r0], xmm0
pxor xmm2, xmm2
ZIGZAG_SUB_4x4 ac, field
;-----------------------------------------------------------------------------
-; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz )
+; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
;-----------------------------------------------------------------------------
%macro INTERLEAVE 1
%endmacro
INIT_MMX
-cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
+cglobal zigzag_interleave_8x8_cavlc_mmx, 3,3
INTERLEAVE 0
INTERLEAVE 8
INTERLEAVE 16
packsswb m5, m5
pxor m0, m0
pcmpeqb m5, m0
- paddb m5, [pb_1 GLOBAL]
+ paddb m5, [pb_1]
movd r0d, m5
mov [r2+0], r0w
shr r0d, 16
%endmacro
INIT_XMM
-cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
+cglobal zigzag_interleave_8x8_cavlc_sse2, 3,3,8
INTERLEAVE_XMM 0
INTERLEAVE_XMM 16
packsswb m2, m3
packsswb m2, m2
packsswb m2, m2
pcmpeqb m5, m2
- paddb m5, [pb_1 GLOBAL]
+ paddb m5, [pb_1]
movd r0d, m5
mov [r2+0], r0w
shr r0d, 16