These changes were split out of the cpuflags commit because they change the output executable.
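
The bulk of the diff drops the *(mmsize/16) factor from the XMM register counts passed to cglobal. That factor was presumably there so that MMX instantiations (mmsize == 8) requested zero XMM registers for the WIN64 prologue; WIN64_SPILL_XMM now zeroes xmm_regs_used itself when mmsize == 8, so callers can pass the plain count. A minimal sketch of the convention (foo is a made-up name, not part of the patch):

    cglobal foo, 3,3,11*(mmsize/16) ; old: count scales to 0 under INIT_MMX
    cglobal foo, 3,3,11             ; new: WIN64_SPILL_XMM drops the count when mmsize == 8

The deblock/chroma helper bodies likewise turn from macro-suffixed local labels (deblock_inter_body_%1 and friends) into plain cglobal helpers whose cpu suffix comes from INIT_MMX/INIT_XMM, and the mc_copy/COPY* macros are restructured around a per-row register count.
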
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
-cglobal %1, 3,3,11*(mmsize/16)
+cglobal %1, 3,3,11
%ifndef HIGH_BIT_DEPTH
%if mmsize == 8
pxor m7, m7
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
%ifdef HIGH_BIT_DEPTH
-cglobal %1, 2,2,6*(mmsize/16)
+cglobal %1, 2,2,6
%else
-cglobal %1, 2,2,11*(mmsize/16)
+cglobal %1, 2,2,11
pxor m7, m7
%endif
%if mmsize==16
movdqa [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro
+INIT_XMM
cglobal add16x16_idct_dc_sse2, 2,2,8
call .loop
add r0, FDEC_STRIDE*4
; void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[8][8] )
;-----------------------------------------------------------------------------
%macro SCAN_8x8_FRAME 5
-cglobal zigzag_scan_8x8_frame, 2,2,8*(mmsize/16)
+cglobal zigzag_scan_8x8_frame, 2,2,8
mova m0, [r1]
mova m1, [r1+ 8*SIZEOF_DCTCOEF]
movu m2, [r1+14*SIZEOF_DCTCOEF]
; 54 55 58 59 60 61 62 63
%undef SCAN_8x8
%macro SCAN_8x8 5
-cglobal zigzag_scan_8x8_field, 2,3,8*(mmsize/16)
+cglobal zigzag_scan_8x8_field, 2,3,8
mova m0, [r1+ 0*SIZEOF_DCTCOEF] ; 03 02 01 00
mova m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
mova m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
%endmacro
%macro ZIGZAG_8x8_CAVLC 1
-cglobal zigzag_interleave_8x8_cavlc, 3,3,8*(mmsize/16)
+cglobal zigzag_interleave_8x8_cavlc, 3,3,8
INTERLEAVE 0, %1
INTERLEAVE 8, %1
INTERLEAVE 16, %1
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_v_luma, 5,5,8*(mmsize/16)
+cglobal deblock_v_luma, 5,5,8
%assign pad 5*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
ADD rsp, pad
RET
-cglobal deblock_h_luma, 5,6,8*(mmsize/16)
+cglobal deblock_h_luma, 5,6,8
%assign pad 7*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_v_luma_intra, 4,7,8*(mmsize/16)
+cglobal deblock_v_luma_intra, 4,7,8
LUMA_INTRA_INIT 3
lea r4, [r1*4]
lea r5, [r1*3]
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra, 4,7,8*(mmsize/16)
+cglobal deblock_h_luma_intra, 4,7,8
LUMA_INTRA_INIT 8
%if mmsize == 8
lea r4, [r1*3]
mova [r0+2*r1], m2
%endmacro
-%macro DEBLOCK_CHROMA 1
+%macro DEBLOCK_CHROMA 0
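+; shared helper body (deblock_intra_body below follows the same pattern);
+; the cpu suffix now comes from INIT_MMX/INIT_XMM, not a macro argument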
+cglobal deblock_inter_body
+ RESET_MM_PERMUTATION
+ LOAD_AB m4, m5, r2, r3
+ LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
+ pxor m4, m4
+ LOAD_TC m6, r4
+ pmaxsw m6, m4
+ pand m7, m6
+ DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
+ ret
+
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma, 7,7,8*(mmsize/16)
+cglobal deblock_v_chroma, 7,7,8
FIX_STRIDES r1
mov r5, r0
sub r0, r1
mov r6, 32/mmsize
.loop:
CHROMA_V_LOAD r5
- call deblock_inter_body_%1
+ call deblock_inter_body
CHROMA_V_STORE
add r0, mmsize
add r5, mmsize
;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma, 5,7,8*(mmsize/16)
+cglobal deblock_h_chroma, 5,7,8
add r1, r1
mov r5, 32/mmsize
%if mmsize == 16
%endif
.loop:
CHROMA_H_LOAD r6
- call deblock_inter_body_%1
+ call deblock_inter_body
CHROMA_H_STORE r6
lea r0, [r0+r1*(mmsize/4)]
add r4, mmsize/8
jg .loop
REP_RET
-deblock_inter_body_%1:
+
+cglobal deblock_intra_body
RESET_MM_PERMUTATION
LOAD_AB m4, m5, r2, r3
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
- pxor m4, m4
- LOAD_TC m6, r4
- pmaxsw m6, m4
- pand m7, m6
- DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
+ CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
ret
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra, 4,6,8*(mmsize/16)
+cglobal deblock_v_chroma_intra, 4,6,8
add r1, r1
mov r5, 32/mmsize
movd m5, r3
SPLATW m5, m5
.loop:
CHROMA_V_LOAD r4
- call deblock_intra_body_%1
+ call deblock_intra_body
CHROMA_V_STORE
add r0, mmsize
add r4, mmsize
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_intra, 4,6,8*(mmsize/16)
+cglobal deblock_h_chroma_intra, 4,6,8
add r1, r1
mov r4, 32/mmsize
%if mmsize == 16
%endif
.loop:
CHROMA_H_LOAD r5
- call deblock_intra_body_%1
+ call deblock_intra_body
CHROMA_H_STORE r5
lea r0, [r0+r1*(mmsize/4)]
dec r4
jg .loop
REP_RET
-
-deblock_intra_body_%1:
- RESET_MM_PERMUTATION
- LOAD_AB m4, m5, r2, r3
- LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
- CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
- ret
%endmacro
%ifndef ARCH_X86_64
INIT_MMX mmx2
-DEBLOCK_CHROMA mmx2
+DEBLOCK_CHROMA
%endif
INIT_XMM sse2
-DEBLOCK_CHROMA sse2
+DEBLOCK_CHROMA
INIT_XMM avx
-DEBLOCK_CHROMA avx
+DEBLOCK_CHROMA
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
%define t5 r5
%define t6 r6
-%macro DEBLOCK_CHROMA 1
+%macro DEBLOCK_CHROMA 0
+cglobal chroma_inter_body
+ LOAD_MASK r2d, r3d
+ movd m6, [r4] ; tc0
+ punpcklbw m6, m6
+ punpcklbw m6, m6
+ pand m7, m6
+ DEBLOCK_P0_Q0
+ ret
+
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
mova m1, [t5+r1]
mova m2, [r0]
mova m3, [r0+r1]
- call chroma_inter_body_%1
+ call chroma_inter_body
mova [t5+r1], m1
mova [r0], m2
CHROMA_V_LOOP 1
cglobal deblock_h_chroma, 5,7,8
CHROMA_H_START
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
- call chroma_inter_body_%1
+ call chroma_inter_body
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
CHROMA_H_LOOP 1
RET
-
-ALIGN 16
-RESET_MM_PERMUTATION
-chroma_inter_body_%1:
- LOAD_MASK r2d, r3d
- movd m6, [r4] ; tc0
- punpcklbw m6, m6
- punpcklbw m6, m6
- pand m7, m6
- DEBLOCK_P0_Q0
- ret
%endmacro ; DEBLOCK_CHROMA
INIT_XMM sse2
-DEBLOCK_CHROMA sse2
+DEBLOCK_CHROMA
INIT_XMM avx
-DEBLOCK_CHROMA avx
+DEBLOCK_CHROMA
%ifndef ARCH_X86_64
INIT_MMX mmx2
-DEBLOCK_CHROMA mmx2
+DEBLOCK_CHROMA
%endif
%define t5 r4
%define t6 r5
-%macro DEBLOCK_CHROMA_INTRA 1
+%macro DEBLOCK_CHROMA_INTRA 0
+cglobal chroma_intra_body
+ LOAD_MASK r2d, r3d
+ mova m5, m1
+ mova m6, m2
+ CHROMA_INTRA_P0 m1, m0, m3
+ CHROMA_INTRA_P0 m2, m3, m0
+ psubb m1, m5
+ psubb m2, m6
+ pand m1, m7
+ pand m2, m7
+ paddb m1, m5
+ paddb m2, m6
+ ret
+
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
mova m1, [t5+r1]
mova m2, [r0]
mova m3, [r0+r1]
- call chroma_intra_body_%1
+ call chroma_intra_body
mova [t5+r1], m1
mova [r0], m2
CHROMA_V_LOOP 0
cglobal deblock_h_chroma_intra, 4,6,8
CHROMA_H_START
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
- call chroma_intra_body_%1
+ call chroma_intra_body
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
CHROMA_H_LOOP 0
RET
-
-ALIGN 16
-RESET_MM_PERMUTATION
-chroma_intra_body_%1:
- LOAD_MASK r2d, r3d
- mova m5, m1
- mova m6, m2
- CHROMA_INTRA_P0 m1, m0, m3
- CHROMA_INTRA_P0 m2, m3, m0
- psubb m1, m5
- psubb m2, m6
- pand m1, m7
- pand m2, m7
- paddb m1, m5
- paddb m2, m6
- ret
%endmacro ; DEBLOCK_CHROMA_INTRA
INIT_XMM sse2
-DEBLOCK_CHROMA_INTRA sse2
+DEBLOCK_CHROMA_INTRA
INIT_XMM avx
-DEBLOCK_CHROMA_INTRA avx
+DEBLOCK_CHROMA_INTRA
%ifndef ARCH_X86_64
INIT_MMX mmx2
-DEBLOCK_CHROMA_INTRA mmx2
+DEBLOCK_CHROMA_INTRA
%endif
%endif ; !HIGH_BIT_DEPTH
%endif
%macro WEIGHTER 1
- cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS*(mmsize/16)
+ cglobal mc_weight_w%1, NUMREGS, NUMREGS, XMMREGS
FIX_STRIDES r1, r3
WEIGHT_START %1
LOAD_HEIGHT
; uint16_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W_ONE 1
-cglobal pixel_avg2_w%1, 6,7,4*(mmsize/16)
+cglobal pixel_avg2_w%1, 6,7,4
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
%endmacro
%macro AVG2_W_TWO 3
-cglobal pixel_avg2_w%1, 6,7,8*(mmsize/16)
+cglobal pixel_avg2_w%1, 6,7,8
sub r4, r2
lea r6, [r4+r3*2]
.height_loop:
; pixel copy
;=============================================================================
-%macro COPY4 2-*
+%macro COPY1 2
movu m0, [r2]
movu m1, [r2+r3]
movu m2, [r2+r3*2]
mova [r0+%1], m3
%endmacro
-%macro COPY_ONE 4
- COPY4 %1, %2
+%macro COPY2 2-4 0, 1
+ movu m0, [r2+%3*mmsize]
+ movu m1, [r2+%4*mmsize]
+ movu m2, [r2+r3+%3*mmsize]
+ movu m3, [r2+r3+%4*mmsize]
+ movu m4, [r2+r3*2+%3*mmsize]
+ movu m5, [r2+r3*2+%4*mmsize]
+ movu m6, [r2+%2+%3*mmsize]
+ movu m7, [r2+%2+%4*mmsize]
+ mova [r0+%3*mmsize], m0
+ mova [r0+%4*mmsize], m1
+ mova [r0+r1+%3*mmsize], m2
+ mova [r0+r1+%4*mmsize], m3
+ mova [r0+r1*2+%3*mmsize], m4
+ mova [r0+r1*2+%4*mmsize], m5
+ mova [r0+%1+%3*mmsize], m6
+ mova [r0+%1+%4*mmsize], m7
%endmacro
-%macro COPY_TWO 4
- movu m0, [r2+%3]
- movu m1, [r2+%4]
- movu m2, [r2+r3+%3]
- movu m3, [r2+r3+%4]
- movu m4, [r2+r3*2+%3]
- movu m5, [r2+r3*2+%4]
- movu m6, [r2+%2+%3]
- movu m7, [r2+%2+%4]
- mova [r0+%3], m0
- mova [r0+%4], m1
- mova [r0+r1+%3], m2
- mova [r0+r1+%4], m3
- mova [r0+r1*2+%3], m4
- mova [r0+r1*2+%4], m5
- mova [r0+%1+%3], m6
- mova [r0+%1+%4], m7
+%macro COPY4 2
+ COPY2 %1, %2, 0, 1
+ COPY2 %1, %2, 2, 3
%endmacro
;-----------------------------------------------------------------------------
%define mova movd
%define movu movd
%endif
- COPY4 r4, r5
+ COPY1 r4, r5
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
.end:
- COPY4 r4, r5
+ COPY1 r4, r5
RET
-%ifdef HIGH_BIT_DEPTH
-cglobal mc_copy_w16_mmx, 5,7
+%macro MC_COPY 1
+%assign %%w %1*SIZEOF_PIXEL/mmsize
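+; %%w = registers per row; COPY %+ %%w dispatches to COPY1/COPY2/COPY4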
+%if %%w > 0
+cglobal mc_copy_w%1, 5,7,8*(%%w/2)
FIX_STRIDES r1, r3
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
- COPY_TWO r5, r6, mmsize*0, mmsize*1
- COPY_TWO r5, r6, mmsize*2, mmsize*3
- sub r4d, 4
+ COPY %+ %%w r5, r6
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
- jg .height_loop
- REP_RET
-
-%macro MC_COPY 2
-cglobal mc_copy_w%2, 5,7,%2-8
- FIX_STRIDES r1, r3
- lea r6, [r3*3]
- lea r5, [r1*3]
-.height_loop:
- COPY_%1 r5, r6, 0, mmsize
sub r4d, 4
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
jg .height_loop
REP_RET
+%endif
%endmacro
INIT_MMX mmx
-MC_COPY TWO, 8
-INIT_XMM sse2
-MC_COPY ONE, 8
-MC_COPY TWO, 16
-INIT_XMM aligned, sse2
-MC_COPY TWO, 16
-%endif ; HIGH_BIT_DEPTH
-
-%ifndef HIGH_BIT_DEPTH
-%macro MC_COPY 2
-cglobal mc_copy_w%2, 5,7
- lea r6, [r3*3]
- lea r5, [r1*3]
-.height_loop:
- %1 r5, r6, 0, mmsize
- lea r2, [r2+r3*4]
- lea r0, [r0+r1*4]
- sub r4d, 4
- jg .height_loop
- REP_RET
-%endmacro
-
-INIT_MMX mmx
-MC_COPY COPY4, 8
-MC_COPY COPY_TWO, 16
+MC_COPY 8
+MC_COPY 16
INIT_XMM sse2
-MC_COPY COPY4, 16
-; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
-; but with SSE3 the overhead is zero, so there's no reason not to include it.
-INIT_XMM sse3
-MC_COPY COPY4, 16
+MC_COPY 8
+MC_COPY 16
INIT_XMM aligned, sse2
-MC_COPY COPY4, 16
-%endif ; !HIGH_BIT_DEPTH
+MC_COPY 16
; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
;-----------------------------------------------------------------------------
%macro HPEL_FILTER 0
-cglobal hpel_filter_v, 5,6,11*(mmsize/16)
+cglobal hpel_filter_v, 5,6,11
FIX_STRIDES r3d, r4d
%ifdef WIN64
movsxd r4, r4d
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_c, 3,3,10*(mmsize/16)
+cglobal hpel_filter_c, 3,3,10
add r2, r2
add r0, r2
lea r1, [r1+r2]
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_h, 3,4,8*(mmsize/16)
+cglobal hpel_filter_h, 3,4,8
%define src r1+r2
add r2, r2
add r0, r2
;-----------------------------------------------------------------------------
; void hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
;-----------------------------------------------------------------------------
+INIT_MMX
cglobal hpel_filter_c_mmx2, 3,3
add r0, r2
lea r1, [r1+r2*2]
; int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 0
-cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9))*(mmsize/16) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
+cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
%ifdef HIGH_BIT_DEPTH
shl dword r6m, 1
FIX_STRIDES r5d
void x264_mc_copy_w8_aligned_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
-void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
void x264_prefetch_fenc_mmx2( uint8_t *, int, uint8_t *, int, int );
void x264_prefetch_ref_mmx2( uint8_t *, int, int );
; int pixel_ssd_MxN( uint16_t *, int, uint16_t *, int )
;-----------------------------------------------------------------------------
%macro SSD_ONE 2
-cglobal pixel_ssd_%1x%2, 4,5,6*(mmsize/16)
+cglobal pixel_ssd_%1x%2, 4,5,6
mov r4, %1*%2/mmsize
pxor m0, m0
.loop
.startloop:
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3
-%if cpuflag(ssse3) ; FIXME wrong, but correcting this modifies the binary
PROLOGUE 0,0,8
-%else
- PROLOGUE 0,0,8*(mmsize/16)
-%endif
%else
PROLOGUE 0,5
DECLARE_REG_TMP 1,2,3,4
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
%macro SSD_NV12 0
-cglobal pixel_ssd_nv12_core, 6,7,7*(mmsize/16)
+cglobal pixel_ssd_nv12_core, 6,7,7
shl r4d, 2
FIX_STRIDES r1, r3
add r0, r4
ABS2 m10, m11, m12, m13
paddusw m8, m10
paddusw m9, m11
-%ifidn cpuname, ssse3
+%if cpuflag(ssse3)
pabsw m10, m6
pabsw m11, m7
pabsw m15, m1
%endrep
RET
-cglobal predict_4x4_vr, 1,1,6*(mmsize/16)
+cglobal predict_4x4_vr, 1,1,6
movh m0, [r0-1*FDEC_STRIDEB] ; ........t3t2t1t0
mova m5, m0
%ifdef HIGH_BIT_DEPTH
movh [r0+3*FDEC_STRIDEB], m3
RET
-cglobal predict_4x4_hd, 1,1,6*(mmsize/16)
+cglobal predict_4x4_hd, 1,1,6
movh m0, [r0-1*FDEC_STRIDEB-4*SIZEOF_PIXEL] ; lt ..
%ifdef HIGH_BIT_DEPTH
movh m1, [r0-1*FDEC_STRIDEB]
; void predict_4x4_vl( pixel *src )
;-----------------------------------------------------------------------------
%macro PREDICT_4x4_V1 3
-cglobal predict_4x4_vl, 1,1,6*(mmsize/16)
+cglobal predict_4x4_vl, 1,1,6
movu m1, [r0-FDEC_STRIDEB]
psrl%1 m3, m1, %2
psrl%1 m2, m1, %2*2
;-----------------------------------------------------------------------------
;void predict_8x8_filter( pixel *src, pixel edge[33], int i_neighbor, int i_filters )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_filter, 4,5,7*(mmsize/16)
+cglobal predict_8x8_filter, 4,5,7
add r0, 0x58*SIZEOF_PIXEL
%define src r0-0x58*SIZEOF_PIXEL
%ifndef ARCH_X86_64
;-----------------------------------------------------------------------------
; void predict_8x8_ddl( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
-cglobal predict_8x8_ddl, 2,2,8*(mmsize/16)
+cglobal predict_8x8_ddl, 2,2,8
mova m5, [r1+16*SIZEOF_PIXEL]
movu m2, [r1+17*SIZEOF_PIXEL]
movu m3, [r1+23*SIZEOF_PIXEL]
; void predict_8x8_ddr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%if avx_enabled == 0
-cglobal predict_8x8_ddr, 2,2,7*(mmsize/16)
+cglobal predict_8x8_ddr, 2,2,7
movu m1, [r1+ 7*SIZEOF_PIXEL]
movu m2, [r1+ 9*SIZEOF_PIXEL]
movu m3, [r1+15*SIZEOF_PIXEL]
; void predict_8x8_hu( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HU 5
-cglobal predict_8x8_hu, 2,2,8*(mmsize/16)
+cglobal predict_8x8_hu, 2,2,8
movu m1, [r1+7*SIZEOF_PIXEL] ; l0 l1 l2 l3 l4 l5 l6 l7
add r0, 4*FDEC_STRIDEB
pshuf%3 m0, m1, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
; void predict_8x8_vr( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_VR 3
-cglobal predict_8x8_vr, 2,3,7*(mmsize/16)
+cglobal predict_8x8_vr, 2,3,7
mova m2, [r1+16*SIZEOF_PIXEL]
movu m3, [r1+15*SIZEOF_PIXEL]
movu m1, [r1+14*SIZEOF_PIXEL]
; void predict_8x8_hd( pixel *src, pixel *edge )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_HD 4
-cglobal predict_8x8_hd, 2,2,8*(mmsize/16)
+cglobal predict_8x8_hd, 2,2,8
add r0, 4*FDEC_STRIDEB
mova m0, [r1] ; l7 .. .. .. .. .. .. ..
mova m1, [r1+ 8*SIZEOF_PIXEL] ; lt l0 l1 l2 l3 l4 l5 l6
cextern pb_01
cextern pd_1024
-%macro QUANT_DC_START_MMX 0
+%macro QUANT_DC_START 0
movd m6, r1m ; mf
movd m7, r2m ; bias
%ifdef HIGH_BIT_DEPTH
SPLATD m6, m6
SPLATD m7, m7
-%else
- SPLATW m6, m6
- SPLATW m7, m7
-%endif ; HIGH_BIT_DEPTH
-%endmacro
-
-%macro QUANT_DC_START_SSSE3 0
+%elif cpuflag(sse4) ; ssse3, but not faster on conroe
movdqa m5, [pb_01]
- movd m6, r1m ; mf
- movd m7, r2m ; bias
pshufb m6, m5
pshufb m7, m5
+%else
+ SPLATW m6, m6
+ SPLATW m7, m7
+%endif
%endmacro
; PABSW mmx and PSIGNW mmx do not individually perform the same operations as
; int quant_2x2( int32_t dct[M*N], int mf, int bias )
;-----------------------------------------------------------------------------
%macro QUANT_DC 2
-cglobal quant_%1x%2_dc, 3,3,8*(mmsize/16)
- QUANT_DC_START_MMX
+cglobal quant_%1x%2_dc, 3,3,8
+ QUANT_DC_START
%if %1*%2 <= mmsize/4
QUANT_ONE_DC r0, m6, m7, 0
%else
; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
;-----------------------------------------------------------------------------
%macro QUANT_AC 2
-cglobal quant_%1x%2, 3,3,8*(mmsize/16)
+cglobal quant_%1x%2, 3,3,8
%assign x 0
%rep %1*%2/(mmsize/2)
QUANT_TWO_AC r0+x, r1+x, r2+x, x
%endmacro
INIT_MMX mmx2
-%define QUANT_DC_START QUANT_DC_START_MMX
QUANT_DC quant_2x2_dc, 1
%ifndef ARCH_X86_64 ; not needed because sse2 is faster
QUANT_DC quant_4x4_dc, 4
INIT_XMM sse4
;Not faster on Conroe, so only used in SSE4 versions
-%define QUANT_DC_START QUANT_DC_START_SSSE3
QUANT_DC quant_4x4_dc, 2, 8
QUANT_AC quant_4x4, 2
QUANT_AC quant_8x8, 8
; void dequant_4x4( dctcoef dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
%macro DEQUANT 3
-cglobal dequant_%1x%1, 0,3,6*(mmsize/16)
+cglobal dequant_%1x%1, 0,3,6
.skip_prologue:
DEQUANT_START %2+2, %2
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
DEQUANT 4, 4, 1
-INIT_XMM sse4
-DEQUANT 4, 4, 1
-INIT_XMM sse2
DEQUANT 8, 6, 1
INIT_XMM sse4
+DEQUANT 4, 4, 1
DEQUANT 8, 6, 1
%else
%ifndef ARCH_X86_64
%endif
%macro DEQUANT_DC 2
-cglobal dequant_4x4dc, 0,3,6*(mmsize/16)
+cglobal dequant_4x4dc, 0,3,6
DEQUANT_START 6, 6
.lshift:
; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
;-----------------------------------------------------------------------------
%macro DENOISE_DCT 0
-cglobal denoise_dct, 4,4,8*(mmsize/16)
+cglobal denoise_dct, 4,4,8
pxor m6, m6
.loop:
sub r3, mmsize/2
; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
%macro DENOISE_DCT 0
-cglobal denoise_dct, 4,4,7*(mmsize/16)
+cglobal denoise_dct, 4,4,7
pxor m6, m6
.loop:
sub r3, mmsize
;xmm7: DC prediction xmm6: H prediction xmm5: V prediction
;xmm4: DC pred score xmm3: H pred score xmm2: V pred score
%macro INTRA_SAD16 0
-cglobal intra_sad_x3_16x16, 3,5,8*(mmsize/16)
+cglobal intra_sad_x3_16x16, 3,5,8
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1-FDEC_STRIDE+0]
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
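+ ; don't reserve WIN64 XMM spill space for MMX (mmsize == 8) functions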
+ %if mmsize == 8
+ %assign xmm_regs_used 0
+ %endif
ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 6
sub rsp, (xmm_regs_used-6)*16+16
INIT_CPUFLAGS %1
%endmacro
-INIT_MMX
+INIT_XMM
; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
;-----------------------------------------------------------------------------
; intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
+INIT_XMM
cglobal checkasm_call, 4,7,16
sub rsp, max_args*8
%assign stack_offset stack_offset+max_args*8