;*****************************************************************************
;* deblock-a.asm: x86 deblocking
;*****************************************************************************
-;* Copyright (C) 2005-2011 x264 project
+;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
cextern pw_00ff
cextern pw_pixel_max
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
-; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_luma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma, 5,5,8
%assign pad 5*mmsize+12-(stack_offset&15)
RET
%endmacro
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
; m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
paddw t0, %3, %2
mova t2, %4
paddw t2, %3
LOAD_AB t0, t1, r2d, r3d
mova %1, t0
LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
mova %2, t0 ; mask0
psrlw t3, %1, 2
%else
%endif
%endmacro
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
-; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra, 4,7,16
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,7,16
%define t0 m15
%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
-; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_v_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra, 4,7,8
LUMA_INTRA_INIT 3
RET
;-----------------------------------------------------------------------------
-; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_h_luma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra, 4,7,8
LUMA_INTRA_INIT 8
RET
%endmacro
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
[base], [base+stride], [base+stride*2], [base3], \
; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
psubusb %5, %2, %1
psubusb %4, %1, %2
%else
mova %4, %2
%endmacro
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
-; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma, 5,5,10
RET
;-----------------------------------------------------------------------------
-; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
-cglobal deblock_h_luma, 5,7
- movsxd r10, r1d
- lea r11, [r10+r10*2]
- lea r6, [r0-4]
- lea r5, [r0-4+r11]
-%ifdef WIN64
- sub rsp, 0x98
+cglobal deblock_h_luma, 5,9
+ lea r8, [r1*3]
+ lea r6, [r0-4]
+ lea r5, [r0-4+r8]
+%if WIN64
+ sub rsp, 0x98
%define pix_tmp rsp+0x30
%else
- sub rsp, 0x68
+ sub rsp, 0x68
%define pix_tmp rsp
%endif
; transpose 6x16 -> tmp space
- TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp
- lea r6, [r6+r10*8]
- lea r5, [r5+r10*8]
- TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp
+ lea r6, [r6+r1*8]
+ lea r5, [r5+r1*8]
+ TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r1, r8), pix_tmp+8
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
- ; don't backup r6, r5, r10, r11 because deblock_v_luma_sse2 doesn't use them
+ ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
+ mov r7, r1
lea r0, [pix_tmp+0x30]
mov r1d, 0x10
-%ifdef WIN64
+%if WIN64
mov [rsp+0x20], r4
%endif
call deblock_v_luma
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
- TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+ TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
- shl r10, 3
- sub r6, r10
- sub r5, r10
- shr r10, 3
+ shl r7, 3
+ sub r6, r7
+ sub r5, r7
+ shr r7, 3
movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
- TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
+ TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
-%ifdef WIN64
+%if WIN64
add rsp, 0x98
%else
add rsp, 0x68
%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
-; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v8_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma, 5,5
lea r4, [r1*3]
RET
;-----------------------------------------------------------------------------
-; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma, 0,5
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
pavgb t0, p2, p1
pavgb t1, p0, q0
%else
%endif
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
mova t5, t1
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
paddb t2, p2, p1
paddb t3, p0, q0
%else
pand t2, mpb_1
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
pavgb t1, p2, q1
psubb t2, p2, q1
%else
%define t1 m5
%define t2 m6
%define t3 m7
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
%define p2 m8
%define q2 m9
%define t4 m10
%endif
;-----------------------------------------------------------------------------
-; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_intra, 4,6,16
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
sub esp, 0x60
%endif
lea r4, [r1*4]
mova p0, [r4+r5]
mova q0, [r0]
mova q1, [r0+r1]
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
pxor mpb_0, mpb_0
mova mpb_1, [pb_1]
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
LUMA_INTRA_SWAP_PQ
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
add esp, 0x60
%endif
RET
INIT_MMX cpuname
-%ifdef ARCH_X86_64
+%if ARCH_X86_64
;-----------------------------------------------------------------------------
-; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_luma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_luma_intra, 4,7
- movsxd r10, r1d
- lea r11, [r10*3]
- lea r6, [r0-4]
- lea r5, [r0-4+r11]
- sub rsp, 0x88
+cglobal deblock_h_luma_intra, 4,9
+ lea r8, [r1*3]
+ lea r6, [r0-4]
+ lea r5, [r0-4+r8]
+ sub rsp, 0x88
%define pix_tmp rsp
; transpose 8x16 -> tmp space
- TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
- lea r6, [r6+r10*8]
- lea r5, [r5+r10*8]
- TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
-
- lea r0, [pix_tmp+0x40]
- mov r1, 0x10
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+ lea r6, [r6+r1*8]
+ lea r5, [r5+r1*8]
+ TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r1, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+
+ mov r7, r1
+ lea r0, [pix_tmp+0x40]
+ mov r1, 0x10
call deblock_v_luma_intra
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
- lea r5, [r6+r11]
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
- shl r10, 3
- sub r6, r10
- sub r5, r10
- shr r10, 3
- TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
- add rsp, 0x88
+ lea r5, [r6+r8]
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
+ shl r7, 3
+ sub r6, r7
+ sub r5, r7
+ shr r7, 3
+ TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
+ add rsp, 0x88
RET
%else
cglobal deblock_h_luma_intra, 2,4
DEBLOCK_LUMA_INTRA v
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_LUMA_INTRA v8
%endif
%endif ; !HIGH_BIT_DEPTH
-%ifdef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
ret
;-----------------------------------------------------------------------------
-; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma, 7,7,8
+cglobal deblock_v_chroma, 5,7,8
FIX_STRIDES r1
mov r5, r0
sub r0, r1
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
add r1, r1
ret
;-----------------------------------------------------------------------------
-; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_v_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,6,8
add r1, r1
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
add r1, r1
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_intra_mbaff( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_intra_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
add r1, r1
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_mbaff( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma_mbaff( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_mbaff, 5,7,8
add r1, r1
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_422_intra( uint16_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_422_intra( uint16_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422_intra, 4,6,8
add r1, r1
REP_RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_422( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma_422( uint16_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_422, 5,7,8
add r1, r1
REP_RET
%endmacro ; DEBLOCK_CHROMA
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_CHROMA
%endif
DEBLOCK_CHROMA
%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
+%if HIGH_BIT_DEPTH == 0
%macro CHROMA_V_START 0
dec r2d ; alpha-1
dec r3d ; beta-1
ret
;-----------------------------------------------------------------------------
-; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_v_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma, 5,6,8
CHROMA_V_START
RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma, 5,7,8
CHROMA_H_START
DEBLOCK_CHROMA
INIT_XMM avx
DEBLOCK_CHROMA
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_CHROMA
%endif
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_mbaff( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+; void deblock_h_chroma_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_H_CHROMA_420_MBAFF 0
cglobal deblock_h_chroma_mbaff, 5,7,8
INIT_XMM sse2
DEBLOCK_H_CHROMA_420_MBAFF
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
INIT_MMX mmx2
DEBLOCK_H_CHROMA_420_MBAFF
%endif
%macro DEBLOCK_H_CHROMA_422 0
-cglobal deblock_h_chroma_422, 5,7,8
-%ifdef ARCH_X86_64
- %define cntr r11
+cglobal deblock_h_chroma_422, 5,8,8
+%if ARCH_X86_64
+ %define cntr r7
%else
%define cntr dword r0m
%endif
%macro DEBLOCK_CHROMA_INTRA 0
;-----------------------------------------------------------------------------
-; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_v_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra, 4,5,8
CHROMA_V_START
RET
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_intra( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra, 4,6,8
CHROMA_H_START
DEBLOCK_CHROMA_INTRA
INIT_MMX mmx2
DEBLOCK_CHROMA_INTRA_BODY
-%ifndef ARCH_X86_64
+%if ARCH_X86_64 == 0
DEBLOCK_CHROMA_INTRA
%endif
;-----------------------------------------------------------------------------
-; void deblock_h_chroma_intra_mbaff( uint8_t *pix, int stride, int alpha, int beta )
+; void deblock_h_chroma_intra_mbaff( uint8_t *pix, intptr_t stride, int alpha, int beta )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal deblock_h_chroma_intra_mbaff, 4,6,8
%endmacro
%macro LOAD_BYTES_XMM 1
- movu m0, [%1-4] ; FIXME could be aligned if we changed nnz's allocation
+ movu m2, [%1-4] ; FIXME could be aligned if we changed nnz's allocation
movu m1, [%1+12]
- mova m2, m0
- pslldq m0, 1
+ pslldq m0, m2, 1
shufps m2, m1, q3131 ; cur nnz, all rows
pslldq m1, 1
shufps m0, m1, q3131 ; left neighbors
RET
%macro DEBLOCK_STRENGTH_XMM 0
-cglobal deblock_strength, 6,6,8
+cglobal deblock_strength, 6,6,7
; Prepare mv comparison register
shl r4d, 8
add r4d, 3 - (1<<8)
mova m2, [mv+4*8*2]
mova m1, [mv+4*8*3]
palignr m3, m2, [mv+4*8*2-16], 12
- palignr m7, m1, [mv+4*8*3-16], 12
psubw m2, m3
- psubw m1, m7
+ palignr m3, m1, [mv+4*8*3-16], 12
+ psubw m1, m3
packsswb m2, m1
%else
movu m0, [mv-4+4*8*0]