cglobal x264_pixel_satd_16x16_sse2
cglobal x264_pixel_sa8d_8x8_sse2
cglobal x264_pixel_sa8d_16x16_sse2
-cglobal x264_intra_sa8d_x3_8x8_sse2
+cglobal x264_intra_sa8d_x3_8x8_core_sse2
%macro SAD_INC_4x16P_SSE2 0
movdqu xmm1, [rdx]
-%macro LOAD_HADAMARD8 1
+ALIGN 16
+;-----------------------------------------------------------------------------
+; void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *fenc, int16_t edges[2][8], int *res )
+;-----------------------------------------------------------------------------
+x264_intra_sa8d_x3_8x8_core_sse2:
+ ; 8x8 hadamard
pxor xmm4, xmm4
- movq xmm0, [%1+0*FENC_STRIDE]
- movq xmm7, [%1+1*FENC_STRIDE]
- movq xmm6, [%1+2*FENC_STRIDE]
- movq xmm3, [%1+3*FENC_STRIDE]
- movq xmm5, [%1+4*FENC_STRIDE]
- movq xmm1, [%1+5*FENC_STRIDE]
- movq xmm8, [%1+6*FENC_STRIDE]
- movq xmm2, [%1+7*FENC_STRIDE]
+ movq xmm0, [parm1q+0*FENC_STRIDE]
+ movq xmm7, [parm1q+1*FENC_STRIDE]
+ movq xmm6, [parm1q+2*FENC_STRIDE]
+ movq xmm3, [parm1q+3*FENC_STRIDE]
+ movq xmm5, [parm1q+4*FENC_STRIDE]
+ movq xmm1, [parm1q+5*FENC_STRIDE]
+ movq xmm8, [parm1q+6*FENC_STRIDE]
+ movq xmm2, [parm1q+7*FENC_STRIDE]
punpcklbw xmm0, xmm4
punpcklbw xmm7, xmm4
punpcklbw xmm6, xmm4
HADAMARD1x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2
TRANSPOSE8x8 xmm0, xmm7, xmm6, xmm3, xmm5, xmm1, xmm8, xmm2, xmm4
HADAMARD1x8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
-%endmacro
-%macro SCALAR_SUMSUB 4
- add %1, %2
- add %3, %4
- add %2, %2
- add %4, %4
- sub %2, %1
- sub %4, %3
-%endmacro
-
-%macro SCALAR_HADAMARD1x8 9 ; 8x tmp, dst
- SCALAR_SUMSUB %1, %5, %2, %6
- SCALAR_SUMSUB %3, %7, %4, %8
- SCALAR_SUMSUB %1, %3, %2, %4
- SCALAR_SUMSUB %5, %7, %6, %8
- SCALAR_SUMSUB %1, %2, %3, %4
- SCALAR_SUMSUB %5, %6, %7, %8
- mov [%9+0], %1
- mov [%9+2], %2
- mov [%9+4], %3
- mov [%9+6], %4
- mov [%9+8], %5
- mov [%9+10], %6
- mov [%9+12], %7
- mov [%9+14], %8
-%endmacro
-
-; dest, left, right, src, tmp
-; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
-%macro PRED8x8_LOWPASS 5
- movq %5, %2
- pavgb %2, %3
- pxor %3, %5
- movq %1, %4
- pand %3, [pb_1 GLOBAL]
- psubusb %2, %3
- pavgb %1, %2
-%endmacro
-
-; output: mm0 = filtered t0..t7
-; assumes topleft is available
-%macro PRED8x8_LOAD_TOP_FILT 1
- movq mm1, [%1-1]
- movq mm2, [%1+1]
- and parm4d, byte 4
- jne .have_topright
- mov al, [%1+7]
- mov ah, al
- pinsrw mm2, eax, 3
-.have_topright:
- PRED8x8_LOWPASS mm0, mm1, mm2, [%1], mm7
-%endmacro
-
-%macro PRED8x8_LOAD_LEFT_FILT 10 ; 8x reg, tmp, src
- movzx %1, byte [%10-1*FDEC_STRIDE]
- movzx %2, byte [%10+0*FDEC_STRIDE]
- movzx %3, byte [%10+1*FDEC_STRIDE]
- movzx %4, byte [%10+2*FDEC_STRIDE]
- movzx %5, byte [%10+3*FDEC_STRIDE]
- movzx %6, byte [%10+4*FDEC_STRIDE]
- movzx %7, byte [%10+5*FDEC_STRIDE]
- movzx %8, byte [%10+6*FDEC_STRIDE]
- movzx %9, byte [%10+7*FDEC_STRIDE]
- lea %1, [%1+%2+1]
- lea %2, [%2+%3+1]
- lea %3, [%3+%4+1]
- lea %4, [%4+%5+1]
- lea %5, [%5+%6+1]
- lea %6, [%6+%7+1]
- lea %7, [%7+%8+1]
- lea %8, [%8+%9+1]
- lea %9, [%9+%9+1]
- add %1, %2
- add %2, %3
- add %3, %4
- add %4, %5
- add %5, %6
- add %6, %7
- add %7, %8
- add %8, %9
- shr %1, 2
- shr %2, 2
- shr %3, 2
- shr %4, 2
- shr %5, 2
- shr %6, 2
- shr %7, 2
- shr %8, 2
-%endmacro
-
-ALIGN 16
-;-----------------------------------------------------------------------------
-; void x264_intra_sa8d_x3_8x8_sse2( uint8_t *fenc, uint8_t *fdec,
-; int *res, int i_neighbors )
-;-----------------------------------------------------------------------------
-x264_intra_sa8d_x3_8x8_sse2:
-%define left_1d rsp-16 ; +16
-%define top_1d rsp-32 ; +16
- push rbx
- push r12
- push r13
- push r14
- push r15
- LOAD_HADAMARD8 parm1q
-
- PRED8x8_LOAD_LEFT_FILT r8, r9, r10, r11, r12, r13, r14, r15, rax, parm2q-1
- SCALAR_HADAMARD1x8 r8d, r9d, r10d, r11d, r12d, r13d, r14d, r15d, left_1d
- mov edi, r8d ; dc
-
- PRED8x8_LOAD_TOP_FILT parm2q-FDEC_STRIDE
- movq [top_1d], mm0
- movzx r8d, byte [top_1d+0]
- movzx r9d, byte [top_1d+1]
- movzx r10d, byte [top_1d+2]
- movzx r11d, byte [top_1d+3]
- movzx r12d, byte [top_1d+4]
- movzx r13d, byte [top_1d+5]
- movzx r14d, byte [top_1d+6]
- movzx r15d, byte [top_1d+7]
- SCALAR_HADAMARD1x8 r8w, r9w, r10w, r11w, r12w, r13w, r14w, r15w, top_1d
- lea rdi, [rdi + r8 + 8] ; dc
+ ; dc
+ movzx edi, word [parm2q+0]
+ add di, word [parm2q+16]
+ add edi, 8
and edi, -16
shl edi, 2
SUM1x8_SSE2 xmm8, xmm10, xmm15
movdqa xmm14, xmm15 ; 7x8 sum
- movdqa xmm8, [left_1d] ; left edge
+ movdqa xmm8, [parm2q+0] ; left edge
movd xmm9, edi
psllw xmm8, 3
psubw xmm8, xmm0
punpckldq xmm0, xmm2
punpckldq xmm4, xmm6
punpcklqdq xmm0, xmm4 ; transpose
- movdqa xmm1, [top_1d]
+ movdqa xmm1, [parm2q+16] ; top edge
movdqa xmm2, xmm15
psllw xmm1, 3
psrldq xmm2, 2 ; 8x7 sum
shr eax, 2
mov [parm3q+0], eax ; i8x8_v sa8d
- pop r15
- pop r14
- pop r13
- pop r12
- pop rbx
ret
%include "amd64inc.asm"
%macro STORE8x8 2
+ movq [parm1q + 0*FDEC_STRIDE], %1
movq [parm1q + 1*FDEC_STRIDE], %1
movq [parm1q + 2*FDEC_STRIDE], %1
movq [parm1q + 3*FDEC_STRIDE], %1
- movq [parm1q + 4*FDEC_STRIDE], %1
+ movq [parm1q + 4*FDEC_STRIDE], %2
movq [parm1q + 5*FDEC_STRIDE], %2
movq [parm1q + 6*FDEC_STRIDE], %2
movq [parm1q + 7*FDEC_STRIDE], %2
- movq [parm1q + 8*FDEC_STRIDE], %2
%endmacro
%macro STORE16x16 2
ALIGN 16
pw_2: times 4 dw 2
+pw_4: times 4 dw 4
pw_8: times 4 dw 8
-pb_1: times 16 db 1
pw_3210:
dw 0
dw 1
dw 2
dw 3
ALIGN 16
+pb_1: times 16 db 1
pb_00s_ff:
times 8 db 0
pb_0s_ff:
cglobal predict_4x4_ddl_mmxext
cglobal predict_4x4_vl_mmxext
cglobal predict_8x8_v_mmxext
+cglobal predict_8x8_dc_mmxext
+cglobal predict_8x8_dc_top_mmxext
+cglobal predict_8x8_dc_left_mmxext
cglobal predict_8x8_ddl_mmxext
cglobal predict_8x8_ddl_sse2
cglobal predict_8x8_ddr_sse2
cglobal predict_8x8_vl_sse2
cglobal predict_8x8_vr_core_mmxext
-cglobal predict_8x8_dc_core_mmxext
cglobal predict_8x8c_v_mmx
cglobal predict_8x8c_dc_core_mmxext
cglobal predict_8x8c_p_core_mmxext
PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
%endmacro
-; output: mm0 = filtered t0..t7
-%macro PRED8x8_LOAD_TOP_FILT 0
- sub parm1q, FDEC_STRIDE
-
- and parm2d, 12
- movq mm1, [parm1q-1]
- movq mm2, [parm1q+1]
-
- cmp parm2d, byte 8
- jge .have_topleft
- mov al, [parm1q]
- mov ah, al
- pinsrw mm1, eax, 0
-.have_topleft:
-
- and parm2d, byte 4
- jne .have_topright
- mov al, [parm1q+7]
- mov ah, al
- pinsrw mm2, eax, 3
-.have_topright:
-
- PRED8x8_LOWPASS mm0, mm1, mm2, [parm1q], mm7
-%endmacro
-
-; output: xmm0 = unfiltered t0..t15
-; xmm1 = unfiltered t1..t15
-; xmm2 = unfiltered tl..t14
-%macro PRED8x8_LOAD_TOP_TOPRIGHT_XMM 0
- sub parm1q, FDEC_STRIDE
-
- and parm2d, 12
- movdqu xmm1, [parm1q-1]
-
- cmp parm2d, byte 8
- jge .have_topleft
- mov al, [parm1q]
- mov ah, al
- pinsrw xmm1, eax, 0
-
-.have_topleft:
- and parm2d, byte 4
- jne .have_topright
-
- mov al, [parm1q+7]
- mov ah, al
- pinsrw xmm1, eax, 4
- pshufhw xmm1, xmm1, 0
- movdqa xmm0, xmm1
- movdqa xmm2, xmm1
- psrldq xmm0, 1
- psrldq xmm2, 2
- pshufhw xmm0, xmm0, 0
- pshufhw xmm2, xmm2, 0
- jmp .done_topright
-
-.have_topright:
- movdqu xmm0, [parm1q]
- movdqa xmm2, xmm0
- psrldq xmm2, 1
- mov al, [parm1q+15]
- mov ah, al
- pinsrw xmm2, eax, 7
-.done_topright:
-%endmacro
;-----------------------------------------------------------------------------
-;
; void predict_4x4_ddl_mmxext( uint8_t *src )
-;
;-----------------------------------------------------------------------------
ALIGN 16
ret
;-----------------------------------------------------------------------------
-;
; void predict_4x4_vl_mmxext( uint8_t *src )
-;
;-----------------------------------------------------------------------------
ALIGN 16
ret
;-----------------------------------------------------------------------------
-;
-; void predict_8x8_v_mmxext( uint8_t *src, int i_neighbors )
-;
+; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_v_mmxext:
- PRED8x8_LOAD_TOP_FILT
+ movq mm0, [parm2q+16]
STORE8x8 mm0, mm0
ret
;-----------------------------------------------------------------------------
-;
-; void predict_8x8_dc_core_mmxext( uint8_t *src, int i_neighbors, uint8_t *pix_left );
-;
+; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
ALIGN 16
-predict_8x8_dc_core_mmxext:
- movq mm1, [parm3q-1]
- movq mm2, [parm3q+1]
- PRED8x8_LOWPASS mm4, mm1, mm2, [parm3q], mm7
-
- PRED8x8_LOAD_TOP_FILT
-
+predict_8x8_dc_mmxext:
+ pxor mm0, mm0
pxor mm1, mm1
- psadbw mm0, mm1
- psadbw mm4, mm1
+ psadbw mm0, [parm2q+7]
+ psadbw mm1, [parm2q+16]
paddw mm0, [pw_8 GLOBAL]
- paddw mm0, mm4
+ paddw mm0, mm1
psrlw mm0, 4
pshufw mm0, mm0, 0
packuswb mm0, mm0
+ STORE8x8 mm0, mm0
+ ret
+;-----------------------------------------------------------------------------
+; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge );
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+predict_8x8_dc_top_mmxext:
+ pxor mm0, mm0
+ psadbw mm0, [parm2q+16]
+ paddw mm0, [pw_4 GLOBAL]
+ psrlw mm0, 3
+ pshufw mm0, mm0, 0
+ packuswb mm0, mm0
+ STORE8x8 mm0, mm0
+ ret
+
+;-----------------------------------------------------------------------------
+; void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t *edge );
+;-----------------------------------------------------------------------------
+
+ALIGN 16
+predict_8x8_dc_left_mmxext:
+ pxor mm0, mm0
+ psadbw mm0, [parm2q+7]
+ paddw mm0, [pw_4 GLOBAL]
+ psrlw mm0, 3
+ pshufw mm0, mm0, 0
+ packuswb mm0, mm0
STORE8x8 mm0, mm0
ret
;-----------------------------------------------------------------------------
-;
-; void predict_8x8_ddl_mmxext( uint8_t *src, int i_neighbors )
-;
+; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_ddl_mmxext:
- sub parm1q, FDEC_STRIDE
+ movq mm5, [parm2q+16]
+ movq mm2, [parm2q+17]
+ movq mm3, [parm2q+23]
+ movq mm4, [parm2q+25]
+ movq mm1, mm5
+ psllq mm1, 8
+ PRED8x8_LOWPASS mm0, mm1, mm2, mm5, mm7
+ PRED8x8_LOWPASS mm1, mm3, mm4, [parm2q+24], mm6
- and parm2d, 12
- movq mm1, [parm1q-1]
- movq mm2, [parm1q+1]
-
- cmp parm2d, byte 8
- jge .have_topleft
- mov al, [parm1q]
- mov ah, al
- pinsrw mm1, eax, 0
-
-.have_topleft:
- and parm2d, byte 4
- jne .have_topright
-
- mov al, [parm1q+7]
- mov ah, [parm1q+7]
- pinsrw mm2, eax, 3
- pshufw mm3, mm2, 0xff
- jmp .done_topright
-
-.have_topright:
- movq mm5, [parm1q+9];
- mov al, [parm1q+15]
- mov ah, al
- pinsrw mm5, eax, 3
- movq mm4, [parm1q+7];
- PRED8x8_LOWPASS mm3, mm4, mm5, [parm1q+8], mm7
-.done_topright:
-
-;?0123456789abcdeff
-; [-mm0--][-mm3--]
-;[-mm1--][-mm4--]
-; [-mm2--][-mm5--]
-
- PRED8x8_LOWPASS mm0, mm1, mm2, [parm1q], mm7
- movq mm1, mm0
+%assign Y 7
+%rep 6
+ movq [parm1q+Y*FDEC_STRIDE], mm1
movq mm2, mm0
psllq mm1, 8
- psrlq mm2, 8
- movq mm6, mm3
- movq mm4, mm3
- psllq mm6, 56
- movq mm7, mm0
- por mm2, mm6
- psllq mm4, 8
- movq mm5, mm3
- movq mm6, mm3
- psrlq mm5, 8
- pand mm6, [pb_0s_ff GLOBAL]
- psrlq mm7, 56
- por mm5, mm6
- por mm4, mm7
- PRED8x8_LOWPASS mm6, mm1, mm2, mm0, mm7
- PRED8x8_LOWPASS mm7, mm4, mm5, mm3, mm2
-
-%assign Y 8
-%rep 6
- movq [parm1q+Y*FDEC_STRIDE], mm7
- movq mm1, mm6
- psllq mm7, 8
- psrlq mm1, 56
- psllq mm6, 8
- por mm7, mm1
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
%assign Y (Y-1)
%endrep
- movq [parm1q+Y*FDEC_STRIDE], mm7
- psllq mm7, 8
- psrlq mm6, 56
- por mm7, mm6
+ movq [parm1q+Y*FDEC_STRIDE], mm1
+ psllq mm1, 8
+ psrlq mm0, 56
+ por mm1, mm0
%assign Y (Y-1)
- movq [parm1q+Y*FDEC_STRIDE], mm7
+ movq [parm1q+Y*FDEC_STRIDE], mm1
ret
;-----------------------------------------------------------------------------
-;
-; void predict_8x8_ddl_sse2( uint8_t *src, int i_neighbors )
-;
+; void predict_8x8_ddl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_ddl_sse2:
- PRED8x8_LOAD_TOP_TOPRIGHT_XMM
-
-;?0123456789abcdeff
-; [-----xmm0-----]
-;[-----xmm1-----]
-; [-----xmm2-----]
-
- movdqa xmm3, [pb_00s_ff GLOBAL]
- PRED8x8_LOWPASS_XMM xmm4, xmm1, xmm2, xmm0, xmm5
- movdqa xmm1, xmm4
- movdqa xmm2, xmm4
- pand xmm3, xmm4
- psrldq xmm2, 1
+ movdqa xmm3, [parm2q+16]
+ movdqu xmm2, [parm2q+17]
+ movdqa xmm1, xmm3
pslldq xmm1, 1
- por xmm2, xmm3
- PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
+ PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
-%assign Y 1
+%assign Y 0
%rep 8
psrldq xmm0, 1
movq [parm1q+Y*FDEC_STRIDE], xmm0
%assign Y (Y+1)
%endrep
-
ret
;-----------------------------------------------------------------------------
-;
-; void predict_8x8_ddr_sse2( uint8_t *src, int i_neighbors )
-;
+; void predict_8x8_ddr_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_ddr_sse2:
- lea r8, [rsp-24]
- movq mm0, [parm1q-FDEC_STRIDE]
- movq [r8+8], mm0
-
- and parm2d, byte 4
- mov al, [parm1q-FDEC_STRIDE+7]
- cmovnz ax, [parm1q-FDEC_STRIDE+8]
- mov [r8+16], al
-
- mov dh, [parm1q+3*FDEC_STRIDE-1]
- mov dl, [parm1q+4*FDEC_STRIDE-1]
- mov ah, [parm1q-1*FDEC_STRIDE-1]
- mov al, [parm1q+0*FDEC_STRIDE-1]
- shl edx, 16
- shl eax, 16
- mov dh, [parm1q+5*FDEC_STRIDE-1]
- mov dl, [parm1q+6*FDEC_STRIDE-1]
- mov ah, [parm1q+1*FDEC_STRIDE-1]
- mov al, [parm1q+2*FDEC_STRIDE-1]
- mov [r8+4], eax
- mov [r8], edx
- movzx eax, byte [parm1q+7*FDEC_STRIDE-1]
- movd xmm4, eax
- movzx edx, dl
- lea eax, [rax+2*rax+2]
- add eax, edx
- shr eax, 2
- movd xmm5, eax
-
-; r8 -> {l6 l5 l4 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t8}
-
- movdqu xmm0, [r8]
- movdqu xmm2, [r8+1]
- movdqa xmm1, xmm0
- pslldq xmm1, 1
- por xmm1, xmm4
- PRED8x8_LOWPASS_XMM xmm3, xmm1, xmm2, xmm0, xmm4
- movdqa xmm1, xmm3
+ movdqu xmm3, [parm2q+8]
+ movdqu xmm1, [parm2q+7]
movdqa xmm2, xmm3
- pslldq xmm1, 1
psrldq xmm2, 1
- por xmm1, xmm5
PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4
movdqa xmm1, xmm0
%assign Y 7
%rep 3
movq [parm1q+Y*FDEC_STRIDE], xmm0
- psrldq xmm0, 2
movq [parm1q+(Y-1)*FDEC_STRIDE], xmm1
+ psrldq xmm0, 2
psrldq xmm1, 2
%assign Y (Y-2)
%endrep
ret
;-----------------------------------------------------------------------------
-;
-; void predict_8x8_vl_sse2( uint8_t *src, int i_neighbors )
-;
+; void predict_8x8_vl_sse2( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_vl_sse2:
- PRED8x8_LOAD_TOP_TOPRIGHT_XMM
- PRED8x8_LOWPASS_XMM xmm4, xmm1, xmm2, xmm0, xmm5
+ movdqa xmm4, [parm2q+16]
movdqa xmm2, xmm4
movdqa xmm1, xmm4
movdqa xmm3, xmm4
; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
; xmm3: (t0 + t1 + 1) >> 1
-%assign Y 1
+%assign Y 0
%rep 3
psrldq xmm0, 1
movq [parm1q+ Y *FDEC_STRIDE], xmm3
ret
;-----------------------------------------------------------------------------
-;
-; void predict_8x8_vr_core_mmxext( uint8_t *src, int i_neighbors, uint16_t ltt0 )
-;
+; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
; fills only some pixels:
-; f0123456789abcdef
-; 0 .......
-; 1 ,,,,,,
-; 2 ......
-; 3 ,,,,,
-; 4 .....
-; 5 ,,,,
-; 6 ....
-; 7 ,,,
+; f01234567
+; 0........
+; 1,,,,,,,,
+; 2 .......
+; 3 ,,,,,,,
+; 4 ......
+; 5 ,,,,,,
+; 6 .....
+; 7 ,,,,,
ALIGN 16
predict_8x8_vr_core_mmxext:
- sub parm1q, FDEC_STRIDE
-
- movq mm1, [parm1q-1]
- movq mm2, [parm1q+1]
-
- and parm2d, byte 4
- jne .have_topright
- mov al, [parm1q+7]
- mov ah, al
- pinsrw mm2, eax, 3
-.have_topright:
-
- PRED8x8_LOWPASS mm4, mm1, mm2, [parm1q], mm7
- movq mm1, mm4
- movq mm2, mm4
- psllq mm1, 8
- movq mm3, mm4
- pinsrw mm1, parm3d, 0
- psrlq mm2, 8
- pavgb mm3, mm1
- PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm5
+ movq mm2, [parm2q+16]
+ movq mm3, [parm2q+15]
+ movq mm1, [parm2q+14]
+ movq mm4, mm3
+ pavgb mm3, mm2
+ PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
-%assign Y 1
+%assign Y 0
%rep 3
- psllq mm0, 8
movq [parm1q+ Y *FDEC_STRIDE], mm3
movq [parm1q+(Y+1)*FDEC_STRIDE], mm0
psllq mm3, 8
+ psllq mm0, 8
%assign Y (Y+2)
%endrep
- psllq mm0, 8
movq [parm1q+ Y *FDEC_STRIDE], mm3
movq [parm1q+(Y+1)*FDEC_STRIDE], mm0
ret
;-----------------------------------------------------------------------------
-;
; void predict_8x8c_v_mmx( uint8_t *src )
-;
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8c_v_mmx :
- sub parm1q, FDEC_STRIDE
- movq mm0, [parm1q]
+ movq mm0, [parm1q - FDEC_STRIDE]
STORE8x8 mm0, mm0
ret
;-----------------------------------------------------------------------------
-;
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
-;
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8c_dc_core_mmxext:
- sub parm1q, FDEC_STRIDE
-
- movq mm0, [parm1q]
+ movq mm0, [parm1q - FDEC_STRIDE]
pxor mm1, mm1
pxor mm2, mm2
punpckhbw mm1, mm0
ret
;-----------------------------------------------------------------------------
-;
; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
-;
;-----------------------------------------------------------------------------
ALIGN 16
ret
;-----------------------------------------------------------------------------
-;
; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
-;
;-----------------------------------------------------------------------------
ALIGN 16
ret
;-----------------------------------------------------------------------------
-;
; void predict_16x16_v_mmx( uint8_t *src )
-;
;-----------------------------------------------------------------------------
ALIGN 16
ret
;-----------------------------------------------------------------------------
-;
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
-;
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 2
void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int *, int );
-void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int *, int );
+void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * );
void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
#endif
%include "i386inc.asm"
-; this is faster than a constant [edx + Y*FDEC_STRIDE]
%macro STORE8x8 2
- movq [edx + ecx], %1 ; 0
- movq [edx + 2*ecx], %1 ; 1
- movq [edx + 4*ecx], %1 ; 3
- movq [edx + 8*ecx], %2 ; 7
- add edx, eax
- movq [edx ], %1 ; 2
- movq [edx + 2*ecx], %2 ; 4
- movq [edx + eax], %2 ; 5
- movq [edx + 4*ecx], %2 ; 6
+ movq [edx + 0*FDEC_STRIDE], %1
+ movq [edx + 1*FDEC_STRIDE], %1
+ movq [edx + 2*FDEC_STRIDE], %1
+ movq [edx + 3*FDEC_STRIDE], %1
+ movq [edx + 4*FDEC_STRIDE], %2
+ movq [edx + 5*FDEC_STRIDE], %2
+ movq [edx + 6*FDEC_STRIDE], %2
+ movq [edx + 7*FDEC_STRIDE], %2
%endmacro
%macro SAVE_0_1 1
ALIGN 8
pw_2: times 4 dw 2
+pw_4: times 4 dw 4
pw_8: times 4 dw 8
pb_1: times 8 db 1
pw_3210:
SECTION .text
cglobal predict_8x8_v_mmxext
-cglobal predict_8x8_dc_core_mmxext
+cglobal predict_8x8_dc_mmxext
+cglobal predict_8x8_dc_top_mmxext
+cglobal predict_8x8_dc_left_mmxext
+cglobal predict_8x8_ddl_mmxext
+cglobal predict_8x8_ddr_mmxext
+cglobal predict_8x8_vr_core_mmxext
cglobal predict_8x8c_v_mmx
cglobal predict_8x8c_dc_core_mmxext
cglobal predict_8x8c_p_core_mmxext
cglobal predict_16x16_dc_core_mmxext
cglobal predict_16x16_dc_top_mmxext
-%macro PRED8x8_LOWPASS 2
- movq mm3, mm1
- pavgb mm1, mm2
- pxor mm2, mm3
- movq %1 , %2
- pand mm2, [pb_1 GOT_ebx]
- psubusb mm1, mm2
- pavgb %1 , mm1 ; %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
-%endmacro
-
-%macro PRED8x8_LOAD_TOP 0
- mov edx, [picesp + 4]
- mov ecx, FDEC_STRIDE
- mov eax, [picesp + 8]
- sub edx, ecx
- and eax, 12
- movq mm1, [edx-1]
- movq mm2, [edx+1]
+; dest, left, right, src, tmp
+; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
+%macro PRED8x8_LOWPASS 5
+ movq %5, %2
+ pavgb %2, %3
+ pxor %3, %5
+ movq %1, %4
+ pand %3, [pb_1 GOT_ebx]
+ psubusb %2, %3
+ pavgb %1, %2
+%endmacro
- cmp eax, byte 8
- jge .have_topleft
- mov al, [edx]
- mov ah, al
- pinsrw mm1, eax, 0
- mov eax, [picesp + 8]
-.have_topleft:
- and eax, byte 4
- jne .have_topright
- mov al, [edx+7]
- mov ah, al
- pinsrw mm2, eax, 3
-.have_topright:
+;-----------------------------------------------------------------------------
+; void predict_8x8_v_mmxext( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
- PRED8x8_LOWPASS mm0, [edx]
-%endmacro
+ALIGN 16
+predict_8x8_v_mmxext:
+ mov eax, [esp+8]
+ mov edx, [esp+4]
+ movq mm0, [eax+16]
+ STORE8x8 mm0, mm0
+ ret
;-----------------------------------------------------------------------------
-;
-; void predict_8x8_v_mmxext( uint8_t *src, int i_neighbors )
-;
+; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
ALIGN 16
-predict_8x8_v_mmxext:
+predict_8x8_dc_mmxext:
picpush ebx
picgetgot ebx
-
- PRED8x8_LOAD_TOP
- lea eax, [ecx + 2*ecx]
+ mov eax, [picesp + 8]
+ mov edx, [picesp + 4]
+ pxor mm0, mm0
+ pxor mm1, mm1
+ psadbw mm0, [eax+7]
+ psadbw mm1, [eax+16]
+ paddw mm0, [pw_8 GOT_ebx]
+ paddw mm0, mm1
+ psrlw mm0, 4
+ pshufw mm0, mm0, 0
+ packuswb mm0, mm0
STORE8x8 mm0, mm0
+ picpop ebx
+ ret
+;-----------------------------------------------------------------------------
+; void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
+%macro PRED8x8_DC 2
+ALIGN 16
+%1:
+ picpush ebx
+ picgetgot ebx
+ mov eax, [picesp + 8]
+ mov edx, [picesp + 4]
+ pxor mm0, mm0
+ psadbw mm0, [eax+%2]
+ paddw mm0, [pw_4 GOT_ebx]
+ psrlw mm0, 3
+ pshufw mm0, mm0, 0
+ packuswb mm0, mm0
+ STORE8x8 mm0, mm0
picpop ebx
ret
+%endmacro
+
+PRED8x8_DC predict_8x8_dc_top_mmxext, 16
+PRED8x8_DC predict_8x8_dc_left_mmxext, 7
;-----------------------------------------------------------------------------
-;
-; void predict_8x8_dc_core_mmxext( uint8_t *src, int i_neighbors, uint8_t *pix_left );
-;
+; void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t *edge )
;-----------------------------------------------------------------------------
ALIGN 16
-predict_8x8_dc_core_mmxext:
+predict_8x8_ddl_mmxext:
picpush ebx
picgetgot ebx
+ mov eax, [picesp + 8]
+ mov edx, [picesp + 4]
+ movq mm1, [eax + 15]
+ movq mm2, [eax + 17]
+ movq mm3, [eax + 23]
+ movq mm4, [eax + 25]
+ PRED8x8_LOWPASS mm0, mm1, mm2, [eax + 16], mm7
+ PRED8x8_LOWPASS mm1, mm3, mm4, [eax + 24], mm6
+
+%assign Y 7
+%rep 6
+ movq [edx + Y*FDEC_STRIDE], mm1
+ movq mm2, mm0
+ psllq mm1, 8
+ psrlq mm2, 56
+ psllq mm0, 8
+ por mm1, mm2
+%assign Y (Y-1)
+%endrep
+ movq [edx + Y*FDEC_STRIDE], mm1
+ psllq mm1, 8
+ psrlq mm0, 56
+ por mm1, mm0
+%assign Y (Y-1)
+ movq [edx + Y*FDEC_STRIDE], mm1
- mov eax, [picesp + 12]
- movq mm1, [eax-1]
- movq mm2, [eax+1]
- PRED8x8_LOWPASS mm4, [eax]
+ ret
- PRED8x8_LOAD_TOP
+;-----------------------------------------------------------------------------
+; void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
- pxor mm1, mm1
- psadbw mm0, mm1
- psadbw mm4, mm1
- paddw mm0, [pw_8 GOT_ebx]
- paddw mm0, mm4
- psrlw mm0, 4
- pshufw mm0, mm0, 0
- packuswb mm0, mm0
+ALIGN 16
+predict_8x8_ddr_mmxext:
+ picpush ebx
+ picgetgot ebx
+ mov eax, [picesp + 8]
+ mov edx, [picesp + 4]
+ movq mm1, [eax + 7]
+ movq mm2, [eax + 9]
+ movq mm3, [eax + 15]
+ movq mm4, [eax + 17]
+ PRED8x8_LOWPASS mm0, mm1, mm2, [eax + 8], mm7
+ PRED8x8_LOWPASS mm1, mm3, mm4, [eax + 16], mm6
+
+%assign Y 7
+%rep 6
+ movq [edx + Y*FDEC_STRIDE], mm0
+ movq mm2, mm1
+ psrlq mm0, 8
+ psllq mm2, 56
+ psrlq mm1, 8
+ por mm0, mm2
+%assign Y (Y-1)
+%endrep
+ movq [edx + Y*FDEC_STRIDE], mm0
+ psrlq mm0, 8
+ psllq mm1, 56
+ por mm0, mm1
+%assign Y (Y-1)
+ movq [edx + Y*FDEC_STRIDE], mm0
- lea eax, [ecx + 2*ecx]
- STORE8x8 mm0, mm0
+ ret
- picpop ebx
+;-----------------------------------------------------------------------------
+; void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t *edge )
+;-----------------------------------------------------------------------------
+
+; fills only some pixels:
+; f01234567
+; 0........
+; 1,,,,,,,,
+; 2 .......
+; 3 ,,,,,,,
+; 4 ......
+; 5 ,,,,,,
+; 6 .....
+; 7 ,,,,,
+
+ALIGN 16
+predict_8x8_vr_core_mmxext:
+ picpush ebx
+ picgetgot ebx
+ mov eax, [picesp + 8]
+ mov edx, [picesp + 4]
+ movq mm2, [eax + 16]
+ movq mm3, [eax + 15]
+ movq mm1, [eax + 14]
+ movq mm4, mm3
+ pavgb mm3, mm2
+ PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm7
+
+%assign Y 0
+%rep 3
+ movq [edx + Y *FDEC_STRIDE], mm3
+ movq [edx + (Y+1)*FDEC_STRIDE], mm0
+ psllq mm3, 8
+ psllq mm0, 8
+%assign Y (Y+2)
+%endrep
+ movq [edx + Y *FDEC_STRIDE], mm3
+ movq [edx + (Y+1)*FDEC_STRIDE], mm0
+
ret
;-----------------------------------------------------------------------------
-;
; void predict_8x8c_v_mmx( uint8_t *src )
-;
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8c_v_mmx :
mov edx, [esp + 4]
- mov ecx, FDEC_STRIDE
- sub edx, ecx
- movq mm0, [edx]
- lea eax, [ecx + 2*ecx]
+ movq mm0, [edx - FDEC_STRIDE]
STORE8x8 mm0, mm0
ret
;-----------------------------------------------------------------------------
-;
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
-;
;-----------------------------------------------------------------------------
ALIGN 16
picgetgot ebx
mov edx, [picesp + 4]
- mov ecx, FDEC_STRIDE
- sub edx, ecx
- lea eax, [ecx + 2*ecx]
- movq mm0, [edx]
+ movq mm0, [edx - FDEC_STRIDE]
pxor mm1, mm1
pxor mm2, mm2
punpckhbw mm1, mm0
ret
;-----------------------------------------------------------------------------
-;
; void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c )
-;
;-----------------------------------------------------------------------------
ALIGN 16
ret
;-----------------------------------------------------------------------------
-;
; void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c )
-;
;-----------------------------------------------------------------------------
ALIGN 16
ret
;-----------------------------------------------------------------------------
-;
; void predict_16x16_v_mmx( uint8_t *src )
-;
;-----------------------------------------------------------------------------
ALIGN 16
ret
;-----------------------------------------------------------------------------
-;
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
-;
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 3
extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
extern void predict_8x8c_v_mmx( uint8_t *src );
-extern void predict_8x8_v_mmxext( uint8_t *src, int i_neighbors );
-extern void predict_8x8_ddl_mmxext( uint8_t *src, int i_neighbors );
-extern void predict_8x8_ddl_sse2( uint8_t *src, int i_neighbors );
-extern void predict_8x8_ddr_sse2( uint8_t *src, int i_neighbors );
-extern void predict_8x8_vl_sse2( uint8_t *src, int i_neighbors );
-extern void predict_8x8_vr_core_mmxext( uint8_t *src, int i_neighbors, uint16_t ltt0 );
-extern void predict_8x8_dc_core_mmxext( uint8_t *src, int i_neighbors, uint8_t *pix_left );
+extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_ddl_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_ddr_mmxext( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_ddl_sse2( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_ddr_sse2( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
+extern void predict_8x8_vr_core_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_4x4_ddl_mmxext( uint8_t *src );
extern void predict_4x4_vl_mmxext( uint8_t *src );
predict_8x8c_dc_core_mmxext( src, s2, s3 );
}
-#define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
-static void predict_8x8_dc( uint8_t *src, int i_neighbor )
-{
- uint8_t l[10];
- l[0] = i_neighbor&MB_TOPLEFT ? SRC(-1,-1) : SRC(-1,0);
- l[1] = SRC(-1,0);
- l[2] = SRC(-1,1);
- l[3] = SRC(-1,2);
- l[4] = SRC(-1,3);
- l[5] = SRC(-1,4);
- l[6] = SRC(-1,5);
- l[7] = SRC(-1,6);
- l[8] =
- l[9] = SRC(-1,7);
-
- predict_8x8_dc_core_mmxext( src, i_neighbor, l+1 );
-}
-
#ifdef ARCH_X86_64
static void predict_16x16_h( uint8_t *src )
{
****************************************************************************/
#define PL(y) \
- int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
-#define PREDICT_8x8_LOAD_LEFT(have_tl) \
- int l0 = ((have_tl || (i_neighbor&MB_TOPLEFT) ? SRC(-1,-1) : SRC(-1,0)) \
- + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
- PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
- UNUSED int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2;
-
+ UNUSED int l##y = edge[14-y];
#define PT(x) \
- int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
-#define PREDICT_8x8_LOAD_TOP(have_tl) \
- int t0 = ((have_tl || (i_neighbor&MB_TOPLEFT) ? SRC(-1,-1) : SRC(0,-1)) \
- + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
- PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
- UNUSED int t7 = ((i_neighbor&MB_TOPRIGHT ? SRC(8,-1) : SRC(7,-1)) \
- + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2; \
-
-#define PTR(x) \
- t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
-#define PREDICT_8x8_LOAD_TOPRIGHT \
- int t8, t9, t10, t11, t12, t13, t14, t15; \
- if(i_neighbor&MB_TOPRIGHT) { \
- PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
- t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
- } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
-
+ UNUSED int t##x = edge[16+x];
#define PREDICT_8x8_LOAD_TOPLEFT \
- int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2;
+ int lt = edge[15];
+#define PREDICT_8x8_LOAD_LEFT \
+ PL(0) PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) PL(7)
+#define PREDICT_8x8_LOAD_TOP \
+ PT(0) PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) PT(7)
#define PREDICT_8x8_DC(v) \
int y; \
src += FDEC_STRIDE; \
}
-#define SRC4(x,y) *(uint32_t*)&SRC(x,y)
-
-#if 0
-static void predict_8x8_ddl( uint8_t *src, int i_neighbor )
-{
- PREDICT_8x8_LOAD_TOP(0)
- uint32_t vec0, vec1;
- int t8b;
- if(i_neighbor&MB_TOPRIGHT)
- {
- PREDICT_8x8_LOAD_TOPRIGHT
- vec1 = (F2(t14,t15,t15)<<24)
- + (F2(t13,t14,t15)<<16)
- + (F2(t12,t13,t14)<< 8)
- + (F2(t11,t12,t13)<< 0);
- vec0 = (F2(t10,t11,t12)<<24)
- + (F2( t9,t10,t11)<<16)
- + (F2( t8, t9,t10)<< 8)
- + (F2( t7, t8, t9)<< 0);
- t8b = t8;
- }
- else
- {
- t8b = SRC(7,-1);
- vec1 = t8b * 0x01010101;
- vec0 = (vec1&0xffffff00) + F2(t7,t8b,t8b);
- }
- SRC4(4,7) = vec1;
- SRC4(4,3) =
- SRC4(0,7) = vec0;
- SRC4(4,6) = vec1 = (vec1<<8) + (vec0>>24);
- SRC4(4,2) =
- SRC4(0,6) = vec0 = (vec0<<8) + F2(t6,t7,t8b);
- SRC4(4,5) = vec1 = (vec1<<8) + (vec0>>24);
- SRC4(4,1) =
- SRC4(0,5) = vec0 = (vec0<<8) + F2(t5,t6,t7);
- SRC4(4,4) = vec1 = (vec1<<8) + (vec0>>24);
- SRC4(4,0) =
- SRC4(0,4) = vec0 = (vec0<<8) + F2(t4,t5,t6);
- SRC4(0,3) = vec0 = (vec0<<8) + F2(t3,t4,t5);
- SRC4(0,2) = vec0 = (vec0<<8) + F2(t2,t3,t4);
- SRC4(0,1) = vec0 = (vec0<<8) + F2(t1,t2,t3);
- SRC4(0,0) = vec0 = (vec0<<8) + F2(t0,t1,t2);
-}
-#endif
+#define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
-static void predict_8x8_ddr( uint8_t *src, int i_neighbor )
+static void predict_8x8_vr_mmxext( uint8_t *src, uint8_t edge[33] )
{
- PREDICT_8x8_LOAD_TOP(1)
- PREDICT_8x8_LOAD_LEFT(1)
- PREDICT_8x8_LOAD_TOPLEFT
- uint32_t vec0, vec1;
- vec1 = (F2(t7,t6,t5)<<24)
- + (F2(t6,t5,t4)<<16)
- + (F2(t5,t4,t3)<< 8)
- + (F2(t4,t3,t2)<< 0);
- vec0 = (F2(t3,t2,t1)<<24)
- + (F2(t2,t1,t0)<<16)
- + (F2(t1,t0,lt)<< 8)
- + (F2(t0,lt,l0)<< 0);
- SRC4(4,0) = vec1;
- SRC4(0,0) =
- SRC4(4,4) = vec0;
- SRC4(4,1) = vec1 = (vec1<<8) + (vec0>>24);
- SRC4(0,1) =
- SRC4(4,5) = vec0 = (vec0<<8) + F2(lt,l0,l1);
- SRC4(4,2) = vec1 = (vec1<<8) + (vec0>>24);
- SRC4(0,2) =
- SRC4(4,6) = vec0 = (vec0<<8) + F2(l0,l1,l2);
- SRC4(4,3) = vec1 = (vec1<<8) + (vec0>>24);
- SRC4(0,3) =
- SRC4(4,7) = vec0 = (vec0<<8) + F2(l1,l2,l3);
- SRC4(0,4) = vec0 = (vec0<<8) + F2(l2,l3,l4);
- SRC4(0,5) = vec0 = (vec0<<8) + F2(l3,l4,l5);
- SRC4(0,6) = vec0 = (vec0<<8) + F2(l4,l5,l6);
- SRC4(0,7) = vec0 = (vec0<<8) + F2(l5,l6,l7);
-}
-
-#ifdef ARCH_X86_64
-static void predict_8x8_vr_mmxext( uint8_t *src, int i_neighbor )
-{
- PREDICT_8x8_LOAD_TOPLEFT
- const int t0 = F2(SRC(-1,-1), SRC(0,-1), SRC(1,-1));
- predict_8x8_vr_core_mmxext( src, i_neighbor, lt+(t0<<8) );
+ predict_8x8_vr_core_mmxext( src, edge );
{
- PREDICT_8x8_LOAD_LEFT(1)
- SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
+ PREDICT_8x8_LOAD_TOPLEFT
+ PREDICT_8x8_LOAD_LEFT
SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
}
}
-#endif
-#ifdef ARCH_X86
#define SUMSUB(a,b,c,d,e,f,g,h)\
t=a; a+=b; b-=t;\
t=c; c+=d; d-=t;\
t=e; e+=f; f-=t;\
t=g; g+=h; h-=t;
-void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *fenc, uint8_t *src, int res[3], int i_neighbor )
+#ifdef ARCH_X86_64
+void x264_intra_sa8d_x3_8x8_sse2( uint8_t *fenc, uint8_t edge[33], int res[3] )
+#else
+void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *fenc, uint8_t edge[33], int res[3] )
+#endif
{
- PREDICT_8x8_LOAD_TOP(1)
- PREDICT_8x8_LOAD_LEFT(1)
+ PREDICT_8x8_LOAD_TOP
+ PREDICT_8x8_LOAD_LEFT
int t;
- int16_t edges[2][8];
+ DECLARE_ALIGNED( int16_t, sa8d_1d[2][8], 16 );
SUMSUB(l0,l4,l1,l5,l2,l6,l3,l7);
SUMSUB(l0,l2,l1,l3,l4,l6,l5,l7);
SUMSUB(l0,l1,l2,l3,l4,l5,l6,l7);
- edges[0][0] = l0;
- edges[0][1] = l1;
- edges[0][2] = l2;
- edges[0][3] = l3;
- edges[0][4] = l4;
- edges[0][5] = l5;
- edges[0][6] = l6;
- edges[0][7] = l7;
+ sa8d_1d[0][0] = l0;
+ sa8d_1d[0][1] = l1;
+ sa8d_1d[0][2] = l2;
+ sa8d_1d[0][3] = l3;
+ sa8d_1d[0][4] = l4;
+ sa8d_1d[0][5] = l5;
+ sa8d_1d[0][6] = l6;
+ sa8d_1d[0][7] = l7;
SUMSUB(t0,t4,t1,t5,t2,t6,t3,t7);
SUMSUB(t0,t2,t1,t3,t4,t6,t5,t7);
SUMSUB(t0,t1,t2,t3,t4,t5,t6,t7);
- edges[1][0] = t0;
- edges[1][1] = t1;
- edges[1][2] = t2;
- edges[1][3] = t3;
- edges[1][4] = t4;
- edges[1][5] = t5;
- edges[1][6] = t6;
- edges[1][7] = t7;
- x264_intra_sa8d_x3_8x8_core_mmxext( fenc, edges, res );
-}
+ sa8d_1d[1][0] = t0;
+ sa8d_1d[1][1] = t1;
+ sa8d_1d[1][2] = t2;
+ sa8d_1d[1][3] = t3;
+ sa8d_1d[1][4] = t4;
+ sa8d_1d[1][5] = t5;
+ sa8d_1d[1][6] = t6;
+ sa8d_1d[1][7] = t7;
+#ifdef ARCH_X86_64
+ x264_intra_sa8d_x3_8x8_core_sse2( fenc, sa8d_1d, res );
+#else
+ x264_intra_sa8d_x3_8x8_core_mmxext( fenc, sa8d_1d, res );
#endif
+}
/****************************************************************************
* Exported functions:
void x264_predict_8x8_init_mmxext( x264_predict8x8_t pf[12] )
{
pf[I_PRED_8x8_V] = predict_8x8_v_mmxext;
- pf[I_PRED_8x8_DC] = predict_8x8_dc;
- pf[I_PRED_8x8_DDR] = predict_8x8_ddr;
-#ifdef ARCH_X86_64 // x86 not written yet
+ pf[I_PRED_8x8_DC] = predict_8x8_dc_mmxext;
+ pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
+ pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;
pf[I_PRED_8x8_DDL] = predict_8x8_ddl_mmxext;
pf[I_PRED_8x8_VR] = predict_8x8_vr_mmxext;
+#ifdef ARCH_X86
+ pf[I_PRED_8x8_DDR] = predict_8x8_ddr_mmxext;
#endif
}
MB_TOPLEFT = 0x08,
MB_PRIVATE = 0x10,
+
+ ALL_NEIGHBORS = 0xf,
+};
+
+static const int x264_pred_i4x4_neighbors[13] =
+{
+ [I_PRED_4x4_HU] = MB_LEFT,
+ [I_PRED_4x4_H] = MB_LEFT,
+ [I_PRED_4x4_HD] = MB_LEFT | MB_TOPLEFT | MB_TOP,
+ [I_PRED_4x4_DDR] = MB_LEFT | MB_TOPLEFT | MB_TOP,
+ [I_PRED_4x4_VR] = MB_LEFT | MB_TOPLEFT | MB_TOP,
+ [I_PRED_4x4_V] = MB_TOP,
+ [I_PRED_4x4_VL] = MB_TOP | MB_TOPRIGHT,
+ [I_PRED_4x4_DDL] = MB_TOP | MB_TOPRIGHT,
+ [I_PRED_4x4_DC] = MB_LEFT | MB_TOP,
+ [I_PRED_4x4_DC_LEFT] = MB_LEFT,
+ [I_PRED_4x4_DC_TOP] = MB_TOP,
+ [I_PRED_4x4_DC_128] = 0
};
void (*intra_satd_x3_16x16)( uint8_t *fenc, uint8_t *fdec, int res[3] );
void (*intra_satd_x3_8x8c)( uint8_t *fenc, uint8_t *fdec, int res[3] );
void (*intra_satd_x3_4x4)( uint8_t *fenc, uint8_t *fdec, int res[3] );
- void (*intra_sa8d_x3_8x8)( uint8_t *fenc, uint8_t *fdec, int res[3], int i_neighbors );
+ void (*intra_sa8d_x3_8x8)( uint8_t *fenc, uint8_t edge[33], int res[3] );
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
#define SRC(x,y) src[(x)+(y)*FDEC_STRIDE]
#define PL(y) \
- const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
-#define PREDICT_8x8_LOAD_LEFT(have_tl) \
- const int l0 = ((have_tl || (i_neighbor&MB_TOPLEFT) ? SRC(-1,-1) : SRC(-1,0)) \
- + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
- PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
- UNUSED const int l7 = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2;
-
+ edge[14-y] = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
#define PT(x) \
- const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
-#define PREDICT_8x8_LOAD_TOP(have_tl) \
- const int t0 = ((have_tl || (i_neighbor&MB_TOPLEFT) ? SRC(-1,-1) : SRC(0,-1)) \
- + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
- PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
- UNUSED const int t7 = ((i_neighbor&MB_TOPRIGHT ? SRC(8,-1) : SRC(7,-1)) \
- + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2; \
-
-#define PTR(x) \
- t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
-#define PREDICT_8x8_LOAD_TOPRIGHT \
- int t8, t9, t10, t11, t12, t13, t14, t15; \
- if(i_neighbor&MB_TOPRIGHT) { \
- PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
- t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
- } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
+ edge[16+x] = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
+
+void x264_predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters )
+{
+ /* edge[7..14] = l7..l0
+ * edge[15] = lt
+ * edge[16..31] = t0 .. t15
+ * edge[32] = t15 */
+
+ int have_lt = i_neighbor & MB_TOPLEFT;
+ if( i_filters & MB_LEFT )
+ {
+ edge[15] = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2;
+ edge[14] = ((have_lt ? SRC(-1,-1) : SRC(-1,0))
+ + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2;
+ PL(1) PL(2) PL(3) PL(4) PL(5) PL(6)
+ edge[7] = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2;
+ }
+
+ if( i_filters & MB_TOP )
+ {
+ int have_tr = i_neighbor & MB_TOPRIGHT;
+ edge[16] = ((have_lt ? SRC(-1,-1) : SRC(0,-1))
+ + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2;
+ PT(1) PT(2) PT(3) PT(4) PT(5) PT(6)
+ edge[23] = ((have_tr ? SRC(8,-1) : SRC(7,-1))
+ + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2;
+
+ if( i_filters & MB_TOPRIGHT )
+ {
+ if( have_tr )
+ {
+ PT(8) PT(9) PT(10) PT(11) PT(12) PT(13) PT(14)
+ edge[31] =
+ edge[32] = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2;
+ }
+ else
+ {
+ *(uint64_t*)(edge+24) = SRC(7,-1) * 0x0101010101010101ULL;
+ edge[32] = SRC(7,-1);
+ }
+ }
+ }
+}
+#undef PL
+#undef PT
+
+#define PL(y) \
+ UNUSED const int l##y = edge[14-y];
+#define PT(x) \
+ UNUSED const int t##x = edge[16+x];
#define PREDICT_8x8_LOAD_TOPLEFT \
- const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2;
+ const int lt = edge[15];
+#define PREDICT_8x8_LOAD_LEFT \
+ PL(0) PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) PL(7)
+#define PREDICT_8x8_LOAD_TOP \
+ PT(0) PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) PT(7)
+#define PREDICT_8x8_LOAD_TOPRIGHT \
+ PT(8) PT(9) PT(10) PT(11) PT(12) PT(13) PT(14) PT(15)
#define PREDICT_8x8_DC(v) \
int y; \
src += FDEC_STRIDE; \
}
-static void predict_8x8_dc_128( uint8_t *src, int i_neighbor )
+static void predict_8x8_dc_128( uint8_t *src, uint8_t edge[33] )
{
PREDICT_8x8_DC(0x80808080);
}
-static void predict_8x8_dc_left( uint8_t *src, int i_neighbor )
+static void predict_8x8_dc_left( uint8_t *src, uint8_t edge[33] )
{
- PREDICT_8x8_LOAD_LEFT(0)
+ PREDICT_8x8_LOAD_LEFT
const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
PREDICT_8x8_DC(dc);
}
-static void predict_8x8_dc_top( uint8_t *src, int i_neighbor )
+static void predict_8x8_dc_top( uint8_t *src, uint8_t edge[33] )
{
- PREDICT_8x8_LOAD_TOP(0)
+ PREDICT_8x8_LOAD_TOP
const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
PREDICT_8x8_DC(dc);
}
-static void predict_8x8_dc( uint8_t *src, int i_neighbor )
+static void predict_8x8_dc( uint8_t *src, uint8_t edge[33] )
{
- PREDICT_8x8_LOAD_LEFT(0)
- PREDICT_8x8_LOAD_TOP(0)
+ PREDICT_8x8_LOAD_LEFT
+ PREDICT_8x8_LOAD_TOP
const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
+t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
PREDICT_8x8_DC(dc);
}
-static void predict_8x8_h( uint8_t *src, int i_neighbor )
+static void predict_8x8_h( uint8_t *src, uint8_t edge[33] )
{
- PREDICT_8x8_LOAD_LEFT(0)
+ PREDICT_8x8_LOAD_LEFT
#define ROW(y) ((uint32_t*)(src+y*FDEC_STRIDE))[0] =\
((uint32_t*)(src+y*FDEC_STRIDE))[1] = 0x01010101U * l##y
ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
#undef ROW
}
-static void predict_8x8_v( uint8_t *src, int i_neighbor )
+static void predict_8x8_v( uint8_t *src, uint8_t edge[33] )
{
+ const uint64_t top = *(uint64_t*)(edge+16);
int y;
- PREDICT_8x8_LOAD_TOP(0);
- src[0] = t0;
- src[1] = t1;
- src[2] = t2;
- src[3] = t3;
- src[4] = t4;
- src[5] = t5;
- src[6] = t6;
- src[7] = t7;
- for( y = 1; y < 8; y++ )
- *(uint64_t*)(src+y*FDEC_STRIDE) = *(uint64_t*)src;
-}
-static void predict_8x8_ddl( uint8_t *src, int i_neighbor )
-{
- PREDICT_8x8_LOAD_TOP(0)
+ for( y = 0; y < 8; y++ )
+ *(uint64_t*)(src+y*FDEC_STRIDE) = top;
+}
+static void predict_8x8_ddl( uint8_t *src, uint8_t edge[33] )
+{
+ PREDICT_8x8_LOAD_TOP
PREDICT_8x8_LOAD_TOPRIGHT
SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
}
-static void predict_8x8_ddr( uint8_t *src, int i_neighbor )
+static void predict_8x8_ddr( uint8_t *src, uint8_t edge[33] )
{
- PREDICT_8x8_LOAD_TOP(1)
- PREDICT_8x8_LOAD_LEFT(1)
+ PREDICT_8x8_LOAD_TOP
+ PREDICT_8x8_LOAD_LEFT
PREDICT_8x8_LOAD_TOPLEFT
SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
}
-static void predict_8x8_vr( uint8_t *src, int i_neighbor )
+static void predict_8x8_vr( uint8_t *src, uint8_t edge[33] )
{
- PREDICT_8x8_LOAD_TOP(1)
- PREDICT_8x8_LOAD_LEFT(1)
+ PREDICT_8x8_LOAD_TOP
+ PREDICT_8x8_LOAD_LEFT
PREDICT_8x8_LOAD_TOPLEFT
SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
SRC(7,0)= (t6 + t7 + 1) >> 1;
}
-static void predict_8x8_hd( uint8_t *src, int i_neighbor )
+static void predict_8x8_hd( uint8_t *src, uint8_t edge[33] )
{
- PREDICT_8x8_LOAD_TOP(1)
- PREDICT_8x8_LOAD_LEFT(1)
+ PREDICT_8x8_LOAD_TOP
+ PREDICT_8x8_LOAD_LEFT
PREDICT_8x8_LOAD_TOPLEFT
SRC(0,7)= (l6 + l7 + 1) >> 1;
SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
}
-static void predict_8x8_vl( uint8_t *src, int i_neighbor )
+static void predict_8x8_vl( uint8_t *src, uint8_t edge[33] )
{
- PREDICT_8x8_LOAD_TOP(0)
+ PREDICT_8x8_LOAD_TOP
PREDICT_8x8_LOAD_TOPRIGHT
SRC(0,0)= (t0 + t1 + 1) >> 1;
SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
SRC(7,6)= (t10 + t11 + 1) >> 1;
SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
}
-static void predict_8x8_hu( uint8_t *src, int i_neighbor )
+static void predict_8x8_hu( uint8_t *src, uint8_t edge[33] )
{
- PREDICT_8x8_LOAD_LEFT(0)
+ PREDICT_8x8_LOAD_LEFT
SRC(0,0)= (l0 + l1 + 1) >> 1;
SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
#define _PREDICT_H 1
typedef void (*x264_predict_t)( uint8_t *src );
-typedef void (*x264_predict8x8_t)( uint8_t *src, int i_neighbor );
+typedef void (*x264_predict8x8_t)( uint8_t *src, uint8_t edge[33] );
enum intra_chroma_pred_e
{
I_PRED_8x8_DC_128 = 11,
};
+// FIXME enforce edge alignment via uint64_t ?
+void x264_predict_8x8_filter( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
+
void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] );
void x264_predict_8x8c_init ( int cpu, x264_predict_t pf[7] );
void x264_predict_4x4_init ( int cpu, x264_predict_t pf[12] );
/* 8x8 prediction selection */
if( flags & X264_ANALYSE_I8x8 )
{
+ DECLARE_ALIGNED( uint8_t, edge[33], 8 );
x264_pixel_cmp_t sa8d = (*h->pixf.mbcmp == *h->pixf.sad) ? h->pixf.sad[PIXEL_8x8] : h->pixf.sa8d[PIXEL_8x8];
int i_satd_thresh = a->b_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
int i_cost = 0;
int i_pred_mode = x264_mb_predict_intra4x4_mode( h, 4*idx );
predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
+ x264_predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
if( b_merged_satd && i_max == 9 )
{
int satd[3];
- h->pixf.intra_sa8d_x3_8x8( p_src_by, p_dst_by, satd, h->mb.i_neighbour8[idx] );
+ h->pixf.intra_sa8d_x3_8x8( p_src_by, edge, satd );
if( i_pred_mode < 3 )
satd[i_pred_mode] -= 3 * a->i_lambda;
for( i=2; i>=0; i-- )
int i_satd;
int i_mode = predict_mode[i];
- h->predict_8x8[i_mode]( p_dst_by, h->mb.i_neighbour8[idx] );
+ h->predict_8x8[i_mode]( p_dst_by, edge );
i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
+ a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
break;
/* we need to encode this block now (for next ones) */
- h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, h->mb.i_neighbour8[idx] );
+ h->predict_8x8[a->i_predict8x8[idx]]( p_dst_by, edge );
x264_mb_encode_i8x8( h, idx, a->i_qp );
x264_macroblock_cache_intra8x8_pred( h, 2*x, 2*y, a->i_predict8x8[idx] );
}
else if( h->mb.i_type == I_8x8 )
{
+ DECLARE_ALIGNED( uint8_t, edge[33], 8 );
for( idx = 0; idx < 4; idx++ )
{
uint64_t pels_h = 0;
p_src_by = p_src + 8*x + 8*y*FENC_STRIDE;
p_dst_by = p_dst + 8*x + 8*y*FDEC_STRIDE;
predict_4x4_mode_available( h->mb.i_neighbour8[idx], predict_mode, &i_max );
+ x264_predict_8x8_filter( p_dst_by, edge, h->mb.i_neighbour8[idx], ALL_NEIGHBORS );
+
for( i = 0; i < i_max; i++ )
{
i_mode = predict_mode[i];
if( a->i_satd_i8x8_dir[i_mode][idx] > i_thresh )
continue;
- h->predict_8x8[i_mode]( p_dst_by, h->mb.i_neighbour8[idx] );
+ h->predict_8x8[i_mode]( p_dst_by, edge );
i_satd = x264_rd_cost_i8x8( h, a->i_lambda2, idx, i_mode );
if( i_best > i_satd )
}
else if( h->mb.i_type == I_8x8 )
{
+ DECLARE_ALIGNED( uint8_t, edge[33], 8 );
h->mb.b_transform_8x8 = 1;
for( i = 0; i < 4; i++ )
{
uint8_t *p_dst = &h->mb.pic.p_fdec[0][8 * (i&1) + 8 * (i>>1) * FDEC_STRIDE];
int i_mode = h->mb.cache.intra4x4_pred_mode[x264_scan8[4*i]];
- h->predict_8x8[i_mode]( p_dst, h->mb.i_neighbour8[i] );
+ x264_predict_8x8_filter( p_dst, edge, h->mb.i_neighbour8[i], x264_pred_i4x4_neighbors[i_mode] );
+ h->predict_8x8[i_mode]( p_dst, edge );
x264_mb_encode_i8x8( h, i, i_qp );
}
}
x264_predict_t predict_8x8c[4+3];
x264_predict_t predict_4x4[9+3];
x264_predict8x8_t predict_8x8[9+3];
+ DECLARE_ALIGNED( uint8_t, edge[33], 8 );
int ret = 0, ok, used_asm;
int i;
x264_predict_8x8c_init( 0, predict_8x8c );
x264_predict_8x8_init( 0, predict_8x8 );
x264_predict_4x4_init( 0, predict_4x4 );
+ x264_predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
#define TEST_PIXEL( name ) \
for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
TEST_PIXEL_X(3);
TEST_PIXEL_X(4);
-#define TEST_INTRA_SATD( name, pred, satd, ... ) \
+#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
int res_c[3], res_asm[3]; \
pred[i]( buf3+40, ##__VA_ARGS__ ); \
res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \
} \
- pixel_asm.name( buf1+40, buf3+40, res_asm, ##__VA_ARGS__ ); \
+ pixel_asm.name( buf1+40, i8x8 ? edge : buf3+40, res_asm ); \
if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
{ \
ok = 0; \
}
ok = 1; used_asm = 0;
- TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16] );
- TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8] );
- TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4] );
- TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8],
- MB_LEFT|MB_TOP|MB_TOPLEFT|MB_TOPRIGHT );
- TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8],
- MB_LEFT|MB_TOP|MB_TOPLEFT );
+ TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
+ TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8], 0 );
+ TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4], 0 );
+ TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge );
report( "intra satd_x3 :" );
return ret;
{
int ret = 0, ok = 1, used_asm = 0;
int i;
+ DECLARE_ALIGNED( uint8_t, edge[33], 8 );
struct
{
x264_predict_t predict_16x16[4+3];
x264_predict_8x8_init( cpu_new, ip_a.predict_8x8 );
x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
+ x264_predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
+
#define INTRA_TEST( name, dir, ... ) \
if( ip_a.name[dir] != ip_ref.name[dir] )\
{ \
{\
fprintf( stderr, #name "[%d] : [FAILED]\n", dir );\
ok = 0;\
+ int j,k;\
+ for(k=-1; k<16; k++)\
+ printf("%2x ", edge[16+k]);\
+ printf("\n");\
+ for(j=0; j<8; j++){\
+ printf("%2x ", edge[j]);\
+ for(k=0; k<8; k++)\
+ printf("%2x ", buf4[48+k+j*32]);\
+ printf("\n");\
+ }\
+ printf("\n");\
+ for(j=0; j<8; j++){\
+ printf(" ");\
+ for(k=0; k<8; k++)\
+ printf("%2x ", buf3[48+k+j*32]);\
+ printf("\n");\
+ }\
}\
}
for( i = 0; i < 7; i++ )
INTRA_TEST( predict_16x16, i );
for( i = 0; i < 12; i++ )
- INTRA_TEST( predict_8x8, i, 0xf );
- for( i = 0; i < 12; i++ )
- INTRA_TEST( predict_8x8, i, MB_LEFT|MB_TOP|MB_TOPLEFT );
- INTRA_TEST( predict_8x8, I_PRED_8x8_V, MB_LEFT|MB_TOP );
- INTRA_TEST( predict_8x8, I_PRED_8x8_DC, MB_LEFT|MB_TOP );
- INTRA_TEST( predict_8x8, I_PRED_8x8_DDL,MB_LEFT|MB_TOP );
- INTRA_TEST( predict_8x8, I_PRED_8x8_V, MB_LEFT|MB_TOP|MB_TOPRIGHT );
- INTRA_TEST( predict_8x8, I_PRED_8x8_DC, MB_LEFT|MB_TOP|MB_TOPRIGHT );
- INTRA_TEST( predict_8x8, I_PRED_8x8_DDL,MB_LEFT|MB_TOP|MB_TOPRIGHT );
+ INTRA_TEST( predict_8x8, i, edge );
report( "intra pred :" );
return ret;