;* Dylan Yudaken <dyudaken@gmail.com>
;* Holger Lubitz <holger@lubitz.org>
;* Min Chen <chenm001@163.com>
+;* Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
times 8 db 2
times 8 db 4
times 8 db 6
+sq_1: times 1 dq 1
SECTION .text
+cextern pb_0
+cextern pw_1
cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64
cextern pw_00ff
+cextern pw_pixel_max
cextern sw_64
;=============================================================================
; P frame explicit weighted prediction
;=============================================================================
+%ifdef X264_HIGH_BIT_DEPTH
+%macro WEIGHT_START 1 ; (width)
+ movd m2, [r4+32] ; denom
+ movd m3, [r4+36] ; scale
+ mov TMP_REG, [r4+40] ; offset
+ mova m0, [pw_1]
+ shl TMP_REG, BIT_DEPTH-7
+ mova m4, [pw_pixel_max]
+ add TMP_REG, 1
+ psllw m0, m2 ; 1<<denom
+ movd m1, TMP_REG ; 1+(offset<<(BIT_DEPTH-8+1))
+ psllw m3, 1 ; scale<<1
+ punpcklwd m3, m1
+ SPLATD m3, m3
+ paddw m2, [sq_1] ; denom+1
+%endmacro
+
+%macro WEIGHT 2 ; (src1, src2)
+ movh m5, [%1]
+ movh m6, [%2]
+ punpcklwd m5, m0
+ punpcklwd m6, m0
+ pmaddwd m5, m3
+ pmaddwd m6, m3
+ psrad m5, m2
+ psrad m6, m2
+ packssdw m5, m6
+%endmacro
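+
+; A scalar sketch (illustrative pseudocode, not generated) of what WEIGHT_START/WEIGHT
+; compute per pixel, with denom/scale/offset taken from [r4+32]/[r4+36]/[r4+40]:
+;   dst[x] = ((src[x]*scale + (1<<(denom-1))) >> denom) + (offset << (BIT_DEPTH-8))
+; The pmaddwd of [src, 1<<denom] against [scale<<1, 1+(offset<<(BIT_DEPTH-7))] followed by
+; psrad (denom+1) folds the multiply, rounding and offset into one dword per pixel; the
+; clip to [0,pixel_max] is applied afterwards by CLIPW in WEIGHT_TWO_ROW.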
+
+%macro WEIGHT_TWO_ROW 3 ; (src, dst, width)
+ %assign x 0
+%rep (%3+mmsize/2-1)/(mmsize/2)
+%if %3-x/2 <= 4 && mmsize == 16
+ WEIGHT %1+x, %1+r3+x
+ CLIPW m5, [pb_0], m4
+ movh [%2+x], m5
+ movhps [%2+r1+x], m5
+%else
+ WEIGHT %1+x, %1+x+mmsize/2
+ SWAP m5, m7
+ WEIGHT %1+r3+x, %1+r3+x+mmsize/2
+ CLIPW m5, [pb_0], m4
+ CLIPW m7, [pb_0], m4
+ mova [%2+x], m7
+ mova [%2+r1+x], m5
+%endif
+ %assign x x+mmsize
+%endrep
+%endmacro
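+
+; Each %rep iteration covers mmsize bytes (mmsize/2 16-bit pixels) per row; when 4 or
+; fewer pixels remain with 16-byte registers, both rows share one register and are written
+; out with movh/movhps instead of issuing a full-width load/store per row.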
+
+%else ; !X264_HIGH_BIT_DEPTH
+
%macro WEIGHT_START 1
mova m3, [r4]
mova m6, [r4+16]
%endrep
%endmacro
+%endif ; X264_HIGH_BIT_DEPTH
+
;-----------------------------------------------------------------------------
; void mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
%define NUMREGS 6
%define LOAD_HEIGHT
%define HEIGHT_REG r5d
+%define TMP_REG r6d
%else
%define NUMREGS 5
+%define TMP_REG r5d
%define LOAD_HEIGHT mov r4d, r5m
%define HEIGHT_REG r4d
%endif
+%assign XMMREGS 7
+%ifdef X264_HIGH_BIT_DEPTH
+%assign NUMREGS NUMREGS+1
+%assign XMMREGS 8
+%endif
+
%macro WEIGHTER 2
- cglobal mc_weight_w%1_%2, NUMREGS, NUMREGS, 7
+ cglobal mc_weight_w%1_%2, NUMREGS, NUMREGS, XMMREGS*(mmsize/16)
+ FIX_STRIDES r1, r3
WEIGHT_START %1
LOAD_HEIGHT
.loop:
WEIGHTER 8, sse2
WEIGHTER 16, sse2
WEIGHTER 20, sse2
+%ifdef X264_HIGH_BIT_DEPTH
+WEIGHTER 12, sse2
+%else
%define WEIGHT WEIGHT_SSSE3
%define WEIGHT_START WEIGHT_START_SSSE3
INIT_MMX
WEIGHTER 8, ssse3
WEIGHTER 16, ssse3
WEIGHTER 20, ssse3
+%endif
%macro OFFSET_OP 7
mov%6 m0, [%1]
; pixel avg2
;=============================================================================
+%ifdef X264_HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void pixel_avg2_wN( uint16_t *dst, int dst_stride,
+; uint16_t *src1, int src_stride,
+; uint16_t *src2, int height );
+;-----------------------------------------------------------------------------
+%macro AVG2_W_ONE 2
+cglobal pixel_avg2_w%1_%2, 6,7,4*(mmsize/16)
+ sub r4, r2
+ lea r6, [r4+r3*2]
+.height_loop:
+ movu m0, [r2]
+ movu m1, [r2+r3*2]
+%if mmsize == 8
+ pavgw m0, [r2+r4]
+ pavgw m1, [r2+r6]
+%else
+ movu m2, [r2+r4]
+ movu m3, [r2+r6]
+ pavgw m0, m2
+ pavgw m1, m3
+%endif
+ mova [r0], m0
+ mova [r0+r1*2], m1
+ sub r5d, 2
+ lea r2, [r2+r3*4]
+ lea r0, [r0+r1*4]
+ jg .height_loop
+ REP_RET
+%endmacro
+
+%macro AVG2_W_TWO 4
+cglobal pixel_avg2_w%1_%4, 6,7,8*(mmsize/16)
+ sub r4, r2
+ lea r6, [r4+r3*2]
+.height_loop:
+ movu m0, [r2]
+ %2 m1, [r2+mmsize]
+ movu m2, [r2+r3*2]
+ %2 m3, [r2+r3*2+mmsize]
+%if mmsize == 8
+ pavgw m0, [r2+r4]
+ pavgw m1, [r2+r4+mmsize]
+ pavgw m2, [r2+r6]
+ pavgw m3, [r2+r6+mmsize]
+%else
+ movu m4, [r2+r4]
+ %2 m5, [r2+r4+mmsize]
+ movu m6, [r2+r6]
+ %2 m7, [r2+r6+mmsize]
+ pavgw m0, m4
+ pavgw m1, m5
+ pavgw m2, m6
+ pavgw m3, m7
+%endif
+ mova [r0], m0
+ %3 [r0+mmsize], m1
+ mova [r0+r1*2], m2
+ %3 [r0+r1*2+mmsize], m3
+ sub r5d, 2
+ lea r2, [r2+r3*4]
+ lea r0, [r0+r1*4]
+ jg .height_loop
+ REP_RET
+%endmacro
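+
+; Per 16-bit pixel this is just the rounded average of the two half-pel planes,
+;   dst[x] = (src1[x] + src2[x] + 1) >> 1,
+; which is exactly what pavgw computes.  AVG2_W_ONE covers widths that fit one register
+; per row, AVG2_W_TWO widths that need two, with the second load/store op parameterized
+; so that w10 can use movd for its two leftover pixels.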
+
+INIT_MMX
+AVG2_W_ONE 4, mmxext
+AVG2_W_TWO 8, movu, mova, mmxext
+INIT_XMM
+AVG2_W_ONE 8, sse2
+AVG2_W_TWO 10, movd, movd, sse2
+AVG2_W_TWO 16, movu, mova, sse2
+
+INIT_MMX
+cglobal pixel_avg2_w10_mmxext, 6,7
+ sub r4, r2
+ lea r6, [r4+r3*2]
+.height_loop:
+ movu m0, [r2+ 0]
+ movu m1, [r2+ 8]
+ movh m2, [r2+16]
+ movu m3, [r2+r3*2+ 0]
+ movu m4, [r2+r3*2+ 8]
+ movh m5, [r2+r3*2+16]
+ pavgw m0, [r2+r4+ 0]
+ pavgw m1, [r2+r4+ 8]
+ pavgw m2, [r2+r4+16]
+ pavgw m3, [r2+r6+ 0]
+ pavgw m4, [r2+r6+ 8]
+ pavgw m5, [r2+r6+16]
+ mova [r0+ 0], m0
+ mova [r0+ 8], m1
+ movh [r0+16], m2
+ mova [r0+r1*2+ 0], m3
+ mova [r0+r1*2+ 8], m4
+ movh [r0+r1*2+16], m5
+ sub r5d, 2
+ lea r2, [r2+r3*2*2]
+ lea r0, [r0+r1*2*2]
+ jg .height_loop
+ REP_RET
+
+cglobal pixel_avg2_w16_mmxext, 6,7
+ sub r4, r2
+ lea r6, [r4+r3*2]
+.height_loop:
+ movu m0, [r2+ 0]
+ movu m1, [r2+ 8]
+ movu m2, [r2+16]
+ movu m3, [r2+24]
+ movu m4, [r2+r3*2+ 0]
+ movu m5, [r2+r3*2+ 8]
+ movu m6, [r2+r3*2+16]
+ movu m7, [r2+r3*2+24]
+ pavgw m0, [r2+r4+ 0]
+ pavgw m1, [r2+r4+ 8]
+ pavgw m2, [r2+r4+16]
+ pavgw m3, [r2+r4+24]
+ pavgw m4, [r2+r6+ 0]
+ pavgw m5, [r2+r6+ 8]
+ pavgw m6, [r2+r6+16]
+ pavgw m7, [r2+r6+24]
+ mova [r0+ 0], m0
+ mova [r0+ 8], m1
+ mova [r0+16], m2
+ mova [r0+24], m3
+ mova [r0+r1*2+ 0], m4
+ mova [r0+r1*2+ 8], m5
+ mova [r0+r1*2+16], m6
+ mova [r0+r1*2+24], m7
+ sub r5d, 2
+ lea r2, [r2+r3*2*2]
+ lea r0, [r0+r1*2*2]
+ jg .height_loop
+ REP_RET
+
+cglobal pixel_avg2_w18_mmxext, 6,7
+ sub r4, r2
+.height_loop:
+ movu m0, [r2+ 0]
+ movu m1, [r2+ 8]
+ movu m2, [r2+16]
+ movu m3, [r2+24]
+ movh m4, [r2+32]
+ pavgw m0, [r2+r4+ 0]
+ pavgw m1, [r2+r4+ 8]
+ pavgw m2, [r2+r4+16]
+ pavgw m3, [r2+r4+24]
+ pavgw m4, [r2+r4+32]
+ mova [r0+ 0], m0
+ mova [r0+ 8], m1
+ mova [r0+16], m2
+ mova [r0+24], m3
+ movh [r0+32], m4
+ sub r5d, 1
+ lea r2, [r2+r3*2]
+ lea r0, [r0+r1*2]
+ jg .height_loop
+ REP_RET
+
+INIT_XMM
+cglobal pixel_avg2_w18_sse2, 6,7,6
+ sub r4, r2
+.height_loop:
+ movu m0, [r2+ 0]
+ movu m1, [r2+16]
+ movh m2, [r2+32]
+ movu m3, [r2+r4+ 0]
+ movu m4, [r2+r4+16]
+ movh m5, [r2+r4+32]
+ pavgw m0, m3
+ pavgw m1, m4
+ pavgw m2, m5
+ mova [r0+ 0], m0
+ mova [r0+16], m1
+ movh [r0+32], m2
+ sub r5d, 1
+ lea r2, [r2+r3*2]
+ lea r0, [r0+r1*2]
+ jg .height_loop
+ REP_RET
+%endif ; X264_HIGH_BIT_DEPTH
+
+%ifndef X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_avg2_w4( uint8_t *dst, int dst_stride,
; uint8_t *src1, int src_stride,
add r0, r1
dec r5d
jg .height_loop
- RET
+ REP_RET
%endmacro
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
%assign j j+1
%assign k k+1
%endrep
+%endif ; !X264_HIGH_BIT_DEPTH
;=============================================================================
; pixel copy
%1 [r0+%3], m3
%endmacro
+%ifdef X264_HIGH_BIT_DEPTH
+%macro COPY_ONE 6
+ COPY4 %1, %2, %3, %4
+%endmacro
+
+%macro COPY_TWO 6
+ %2 m0, [r2+%5]
+ %2 m1, [r2+%6]
+ %2 m2, [r2+r3+%5]
+ %2 m3, [r2+r3+%6]
+ %2 m4, [r2+r3*2+%5]
+ %2 m5, [r2+r3*2+%6]
+ %2 m6, [r2+%4+%5]
+ %2 m7, [r2+%4+%6]
+ %1 [r0+%5], m0
+ %1 [r0+%6], m1
+ %1 [r0+r1+%5], m2
+ %1 [r0+r1+%6], m3
+ %1 [r0+r1*2+%5], m4
+ %1 [r0+r1*2+%6], m5
+ %1 [r0+%3+%5], m6
+ %1 [r0+%3+%6], m7
+%endmacro
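+
+; With 16-bit pixels a w8 row is 16 bytes (one xmm register, or two mmx registers) and a
+; w16 row is 32 bytes; COPY_ONE copies four rows using one register per row, COPY_TWO uses
+; two, and the load op is parameterized so the aligned variant can use mova.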
+
+INIT_MMX
+cglobal mc_copy_w4_mmx, 4,6
+ FIX_STRIDES r1, r3
+ cmp dword r4m, 4
+ lea r5, [r3*3]
+ lea r4, [r1*3]
+ je .end
+ COPY4 mova, mova, r4, r5
+ lea r2, [r2+r3*4]
+ lea r0, [r0+r1*4]
+.end:
+ COPY4 movu, mova, r4, r5
+ RET
+
+cglobal mc_copy_w16_mmx, 5,7
+ FIX_STRIDES r1, r3
+ lea r6, [r3*3]
+ lea r5, [r1*3]
+.height_loop:
+ COPY_TWO mova, movu, r5, r6, mmsize*0, mmsize*1
+ COPY_TWO mova, movu, r5, r6, mmsize*2, mmsize*3
+ sub r4d, 4
+ lea r2, [r2+r3*4]
+ lea r0, [r0+r1*4]
+ jg .height_loop
+ REP_RET
+
+%macro MC_COPY 5
+cglobal mc_copy_w%2_%4, 5,7,%5
+ FIX_STRIDES r1, r3
+ lea r6, [r3*3]
+ lea r5, [r1*3]
+.height_loop:
+ COPY_%1 mova, %3, r5, r6, 0, mmsize
+ sub r4d, 4
+ lea r2, [r2+r3*4]
+ lea r0, [r0+r1*4]
+ jg .height_loop
+ REP_RET
+%endmacro
+
+MC_COPY TWO, 8, movu, mmx, 0
+INIT_XMM
+MC_COPY ONE, 8, movu, sse2, 0
+MC_COPY TWO, 16, movu, sse2, 8
+MC_COPY TWO, 16, mova, aligned_sse2, 8
+%endif ; X264_HIGH_BIT_DEPTH
+
+%ifndef X264_HIGH_BIT_DEPTH
INIT_MMX
;-----------------------------------------------------------------------------
; void mc_copy_w4( uint8_t *dst, int i_dst_stride,
; but with SSE3 the overhead is zero, so there's no reason not to include it.
COPY_W16_SSE2 mc_copy_w16_sse3, lddqu
COPY_W16_SSE2 mc_copy_w16_aligned_sse2, movdqa
+%endif ; !X264_HIGH_BIT_DEPTH
sar t1d, 3
imul t0d, r4d
lea t0d, [t0+t1*2]
+ FIX_STRIDES t0d
movsxdifnidn t0, t0d
add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride
%endmacro
+%ifdef X264_HIGH_BIT_DEPTH
+%macro UNPACK_UNALIGNED 4
+ movu %1, [%4+0]
+ movu %2, [%4+4]
+ mova %3, %1
+ punpcklwd %1, %2
+ punpckhwd %3, %2
+ mova %2, %1
+%if mmsize == 8
+ punpcklwd %1, %3
+ punpckhwd %2, %3
+%else
+ shufps %1, %3, 10001000b
+ shufps %2, %3, 11011101b
+%endif
+%endmacro
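+; For the interleaved 16-bit chroma used here, this leaves %1 holding pairs of horizontally
+; adjacent U samples and %2 the corresponding V pairs (one pair per dword), ready for
+; pmaddwd against the packed bilinear weights; %3 is only used as a temporary.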
+%else ; !X264_HIGH_BIT_DEPTH
%macro UNPACK_UNALIGNED_MEM 3
punpcklwd %1, %3
%endmacro
movh %2, %3
punpcklwd %1, %2
%endmacro
+%endif ; X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride,
%macro MC_CHROMA 1
cglobal mc_chroma_%1, 0,6
MC_CHROMA_START
+ FIX_STRIDES r4
and r5d, 7
%ifdef ARCH_X86_64
jz .mc1dy
pshufd m5, m5, 0x55
jg .width8
%endif
+%ifdef X264_HIGH_BIT_DEPTH
+ add r2, r2
+ UNPACK_UNALIGNED m0, m1, m2, r3
+%else
movu m0, [r3]
UNPACK_UNALIGNED m0, m1, [r3+2]
mova m1, m0
pand m0, [pw_00ff]
psrlw m1, 8
+%endif ; X264_HIGH_BIT_DEPTH
pmaddwd m0, m7
pmaddwd m1, m7
packssdw m0, m1
SWAP m3, m0
ALIGN 4
.loop2:
+%ifdef X264_HIGH_BIT_DEPTH
+ UNPACK_UNALIGNED m0, m1, m2, r3+r4
+ pmullw m3, m6
+%else ; !X264_HIGH_BIT_DEPTH
movu m0, [r3+r4]
UNPACK_UNALIGNED m0, m1, [r3+r4+2]
pmullw m3, m6
mova m1, m0
pand m0, [pw_00ff]
psrlw m1, 8
+%endif ; X264_HIGH_BIT_DEPTH
pmaddwd m0, m7
pmaddwd m1, m7
mova m2, [pw_32]
pmullw m0, m5
paddw m0, m2
psrlw m0, 6
+%ifdef X264_HIGH_BIT_DEPTH
+ movh [r0], m0
+%if mmsize == 8
+ psrlq m0, 32
+ movh [r1], m0
+%else
+ movhps [r1], m0
+%endif
+%else ; !X264_HIGH_BIT_DEPTH
packuswb m0, m0
movd [r0], m0
%if mmsize==8
psrldq m0, 4
%endif
movd [r1], m0
+%endif ; X264_HIGH_BIT_DEPTH
add r3, r4
add r0, r2
add r1, r2
mova multy0, m5
%endif
%endif
+ FIX_STRIDES r2
.loopx:
+%ifdef X264_HIGH_BIT_DEPTH
+ UNPACK_UNALIGNED m0, m2, m4, r3
+ UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
+%else
movu m0, [r3]
movu m1, [r3+mmsize/2]
UNPACK_UNALIGNED m0, m2, [r3+2]
pand m1, [pw_00ff]
psrlw m2, 8
psrlw m3, 8
+%endif
pmaddwd m0, m7
pmaddwd m2, m7
pmaddwd m1, m7
add r3, r4
ALIGN 4
.loop4:
+%ifdef X264_HIGH_BIT_DEPTH
+ UNPACK_UNALIGNED m0, m1, m2, r3
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+ packssdw m0, m1
+ UNPACK_UNALIGNED m1, m2, m3, r3+mmsize
+ pmaddwd m1, m7
+ pmaddwd m2, m7
+ packssdw m1, m2
+%else ; !X264_HIGH_BIT_DEPTH
movu m0, [r3]
movu m1, [r3+mmsize/2]
UNPACK_UNALIGNED m0, m2, [r3+2]
pmaddwd m3, m7
packssdw m0, m2
packssdw m1, m3
+%endif ; X264_HIGH_BIT_DEPTH
pmullw m4, m6
pmullw m5, m6
mova m2, [pw_32]
paddw m1, m3
psrlw m0, 6
psrlw m1, 6
+%ifdef X264_HIGH_BIT_DEPTH
+ movh [r0], m0
+ movh [r0+mmsize/2], m1
+%if mmsize==8
+ psrlq m0, 32
+ psrlq m1, 32
+ movh [r1], m0
+ movh [r1+mmsize/2], m1
+%else
+ movhps [r1], m0
+ movhps [r1+mmsize/2], m1
+%endif
+%else ; !X264_HIGH_BIT_DEPTH
packuswb m0, m1
%if mmsize==8
pshufw m1, m0, 0x8
movq [r0], m0
movhps [r1], m0
%endif
+%endif ; X264_HIGH_BIT_DEPTH
add r3, r4
add r0, r2
add r1, r2
REP_RET
.width8:
%ifdef ARCH_X86_64
- lea r3, [t2+8]
- lea r0, [t0+4]
- lea r1, [t1+4]
+ lea r3, [t2+8*SIZEOF_PIXEL]
+ lea r0, [t0+4*SIZEOF_PIXEL]
+ lea r1, [t1+4*SIZEOF_PIXEL]
%else
mov r3, r3m
mov r0, r0m
mov r1, r1m
- add r3, 8
- add r0, 4
- add r1, 4
+ add r3, 8*SIZEOF_PIXEL
+ add r0, 4*SIZEOF_PIXEL
+ add r1, 4*SIZEOF_PIXEL
%endif
mov r5d, r8m
jmp .loopx
jmp .mc1d
.mc1dx:
movd m5, r5d
- mov r6d, 2
+ mov r6d, 2*SIZEOF_PIXEL
.mc1d:
+%ifdef X264_HIGH_BIT_DEPTH
+%if mmsize == 16
+ WIN64_SPILL_XMM 8
+%endif
+%endif
mova m4, [pw_8]
SPLATW m5, m5
psubw m4, m5
movifnidn r0, r0mp
movifnidn r1, r1mp
movifnidn r2d, r2m
+ FIX_STRIDES r2
movifnidn r5d, r8m
cmp dword r7m, 4
jg .mc1d_w8
shr r5d, 1
%endif
.loop1d_w4:
+%ifdef X264_HIGH_BIT_DEPTH
+%if mmsize == 8
+ movq m0, [r3+0]
+ movq m2, [r3+8]
+ movq m1, [r3+r6+0]
+ movq m3, [r3+r6+8]
+%else
+ movu m0, [r3]
+ movu m1, [r3+r6]
+ add r3, r11
+ movu m2, [r3]
+ movu m3, [r3+r6]
+%endif
+ SBUTTERFLY wd, 0, 2, 6
+ SBUTTERFLY wd, 1, 3, 7
+ SBUTTERFLY wd, 0, 2, 6
+ SBUTTERFLY wd, 1, 3, 7
+%if mmsize == 16
+ SBUTTERFLY wd, 0, 2, 6
+ SBUTTERFLY wd, 1, 3, 7
+%endif
+%else ; !X264_HIGH_BIT_DEPTH
movq m0, [r3]
movq m1, [r3+r6]
%if mmsize!=8
pand m1, [pw_00ff]
psrlw m2, 8
psrlw m3, 8
+%endif ; X264_HIGH_BIT_DEPTH
pmullw m0, m4
pmullw m1, m5
pmullw m2, m4
paddw m2, m3
psrlw m0, 3
psrlw m2, 3
+%ifdef X264_HIGH_BIT_DEPTH
+%if mmsize == 8
+ xchg r4, r11
+ xchg r2, r10
+%endif
+ movq [r0], m0
+ movq [r1], m2
+%if mmsize == 16
+ add r0, r10
+ add r1, r10
+ movhps [r0], m0
+ movhps [r1], m2
+%endif
+%else ; !X264_HIGH_BIT_DEPTH
packuswb m0, m2
%if mmsize==8
xchg r4, r11
movd [r0], m0
movd [r1], m1
%endif
+%endif ; X264_HIGH_BIT_DEPTH
add r3, r4
add r0, r2
add r1, r2
jg .loop1d_w4
REP_RET
.mc1d_w8:
- sub r2, 4
- sub r4, 8
- mov r10, 4
- mov r11, 8
+ sub r2, 4*SIZEOF_PIXEL
+ sub r4, 8*SIZEOF_PIXEL
+ mov r10, 4*SIZEOF_PIXEL
+ mov r11, 8*SIZEOF_PIXEL
%if mmsize==8
shl r5d, 1
%endif
REP_RET
%endmacro
+%ifdef X264_HIGH_BIT_DEPTH
+INIT_MMX
+MC_CHROMA mmxext
+INIT_XMM
+MC_CHROMA sse2
+%else ; !X264_HIGH_BIT_DEPTH
INIT_MMX
%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM
MC_CHROMA mmxext
MC_CHROMA sse2
MC_CHROMA_SSSE3
MC_CHROMA_SSSE3 _cache64
+%endif ; X264_HIGH_BIT_DEPTH
;* Fiona Glaser <fiona@x264.com>
;* Holger Lubitz <holger@lubitz.org>
;* Mathieu Monnier <manao@melix.net>
+;* Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
+pd_16: times 4 dd 16
+pd_32: times 4 dd 32
+pd_0f: times 4 dd 0xffff
+
+pad10: times 8 dw 10*PIXEL_MAX
+pad20: times 8 dw 20*PIXEL_MAX
+pad30: times 8 dw 30*PIXEL_MAX
+depad: times 4 dd 32*20*PIXEL_MAX + 512
+
+tap1: times 4 dw 1, -5
+tap2: times 4 dw 20, 20
+tap3: times 4 dw -5, 1
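+
+; tap1/tap2/tap3 hold the 6-tap coefficients (1,-5), (20,20), (-5,1) as word pairs for
+; pmaddwd.  The raw vertical filter output spans [-10*PIXEL_MAX, 42*PIXEL_MAX], so
+; hpel_filter_v stores it minus pad20 to fit int16 in buf and uses pad30/pad10 to bring
+; the value back into unsigned range for FILT_PACK's rounding shift; depad folds the
+; compensating bias and the +512 rounding term of hpel_filter_c's final >>10 into one
+; constant.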
+
SECTION .text
+cextern pb_0
cextern pw_1
cextern pw_16
cextern pw_32
cextern pw_00ff
cextern pw_3fff
+cextern pw_pixel_max
cextern pd_128
%macro LOAD_ADD 4
paddw %4, %6
%endmacro
-%macro FILT_PACK 4
- paddw %1, %4
- paddw %2, %4
- psraw %1, %3
- psraw %2, %3
- packuswb %1, %2
+%macro FILT_PACK 4-6 b
+ paddw %1, %4
+ paddw %2, %4
+%if %0 == 6
+ psubusw %1, %6
+ psubusw %2, %6
+ psrlw %1, %3
+ psrlw %2, %3
+%else
+ psraw %1, %3
+ psraw %2, %3
+%endif
+%ifnidn w, %5
+ packuswb %1, %2
+%endif
%endmacro
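; Passing w as the 5th argument leaves the result as words (no packuswb); a 6th argument
; selects the high-bit-depth path, where the bias in %6 is removed with a saturating
; unsigned subtract and the shift is logical rather than arithmetic.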
; The hpel_filter routines use non-temporal writes for output.
;%define movntps movaps
;%define sfence
+%ifdef X264_HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, int stride, int width );
+;-----------------------------------------------------------------------------
+%macro HPEL_FILTER 1
+cglobal hpel_filter_v_%1, 5,6,11*(mmsize/16)
+%ifdef WIN64
+ movsxd r4, r4d
+%endif
+ FIX_STRIDES r3, r4
+ lea r5, [r1+r3]
+ sub r1, r3
+ sub r1, r3
+%if num_mmregs > 8
+ mova m8, [pad10]
+ mova m9, [pad20]
+ mova m10, [pad30]
+ %define s10 m8
+ %define s20 m9
+ %define s30 m10
+%else
+ %define s10 [pad10]
+ %define s20 [pad20]
+ %define s30 [pad30]
+%endif
+ add r0, r4
+ lea r2, [r2+r4]
+ neg r4
+ mova m7, [pw_pixel_max]
+ pxor m0, m0
+.loop:
+ mova m1, [r1]
+ mova m2, [r1+r3]
+ mova m3, [r1+r3*2]
+ mova m4, [r1+mmsize]
+ mova m5, [r1+r3+mmsize]
+ mova m6, [r1+r3*2+mmsize]
+ paddw m1, [r5+r3*2]
+ paddw m2, [r5+r3]
+ paddw m3, [r5]
+ paddw m4, [r5+r3*2+mmsize]
+ paddw m5, [r5+r3+mmsize]
+ paddw m6, [r5+mmsize]
+ add r1, 2*mmsize
+ add r5, 2*mmsize
+ FILT_V2 m1, m2, m3, m4, m5, m6
+ mova m6, [pw_16]
+ psubw m1, s20
+ psubw m4, s20
+ mova [r2+r4], m1
+ mova [r2+r4+mmsize], m4
+ paddw m1, s30
+ paddw m4, s30
+ add r4, 2*mmsize
+ FILT_PACK m1, m4, 5, m6, w, s10
+ CLIPW m1, m0, m7
+ CLIPW m4, m0, m7
+ mova [r0+r4-mmsize*2], m1
+ mova [r0+r4-mmsize*1], m4
+ jl .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void hpel_filter_c( uint16_t *dst, int16_t *buf, int width );
+;-----------------------------------------------------------------------------
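+; Even and odd output positions are accumulated in separate dword vectors (m1/m2) and
+; re-interleaved into pixel order with pslld/pand/por before the clip, rather than being
+; packed and shuffled.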
+cglobal hpel_filter_c_%1, 3,3,10*(mmsize/16)
+ add r2, r2
+ add r0, r2
+ lea r1, [r1+r2]
+ neg r2
+ mova m0, [tap1]
+ mova m7, [tap3]
+%if num_mmregs > 8
+ mova m8, [tap2]
+ mova m9, [depad]
+ %define s1 m8
+ %define s2 m9
+%else
+ %define s1 [tap2]
+ %define s2 [depad]
+%endif
+.loop:
+ movu m1, [r1+r2-4]
+ movu m2, [r1+r2-2]
+ mova m3, [r1+r2+0]
+ movu m4, [r1+r2+2]
+ movu m5, [r1+r2+4]
+ movu m6, [r1+r2+6]
+ pmaddwd m1, m0
+ pmaddwd m2, m0
+ pmaddwd m3, s1
+ pmaddwd m4, s1
+ pmaddwd m5, m7
+ pmaddwd m6, m7
+ paddd m1, s2
+ paddd m2, s2
+ paddd m3, m5
+ paddd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 10
+ psrad m2, 10
+ pslld m2, 16
+ pand m1, [pd_0f]
+ por m1, m2
+ CLIPW m1, [pb_0], [pw_pixel_max]
+ mova [r0+r2], m1
+ add r2, mmsize
+ jl .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void hpel_filter_h( uint16_t *dst, uint16_t *src, int width );
+;-----------------------------------------------------------------------------
+cglobal hpel_filter_h_%1, 3,4,8*(mmsize/16)
+ %define src r1+r2
+ add r2, r2
+ add r0, r2
+ add r1, r2
+ neg r2
+ mova m0, [pw_pixel_max]
+.loop:
+ movu m1, [src-4]
+ movu m2, [src-2]
+ mova m3, [src+0]
+ movu m6, [src+2]
+ movu m4, [src+4]
+ movu m5, [src+6]
+ paddw m3, m6 ; c0
+ paddw m2, m4 ; b0
+ paddw m1, m5 ; a0
+%if mmsize == 16
+ movu m4, [src-4+mmsize]
+ movu m5, [src-2+mmsize]
+%endif
+ movu m7, [src+4+mmsize]
+ movu m6, [src+6+mmsize]
+ paddw m5, m7 ; b1
+ paddw m4, m6 ; a1
+ movu m7, [src+2+mmsize]
+ mova m6, [src+0+mmsize]
+ paddw m6, m7 ; c1
+ FILT_H2 m1, m2, m3, m4, m5, m6
+ mova m7, [pw_1]
+ pxor m2, m2
+ add r2, mmsize*2
+ FILT_PACK m1, m4, 1, m7, w
+ CLIPW m1, m2, m0
+ CLIPW m4, m2, m0
+ mova [r0+r2-mmsize*2], m1
+ mova [r0+r2-mmsize*1], m4
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_MMX
+HPEL_FILTER mmxext
+INIT_XMM
+HPEL_FILTER sse2
+%endif ; X264_HIGH_BIT_DEPTH
+
+%ifndef X264_HIGH_BIT_DEPTH
INIT_MMX
%macro HPEL_V 1-2 0
movntps [r0+r2], m3
add r2, 16
jl .loop
- RET
+ REP_RET
%endif
%define PALIGNR PALIGNR_MMX
%undef movntq
%undef movntps
%undef sfence
+%endif ; !X264_HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void plane_copy_core( uint8_t *dst, int i_dst,
DECL_SUF( x264_pixel_avg_4x2, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
#define MC_WEIGHT(w,type) \
- void x264_mc_weight_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int );
+ void x264_mc_weight_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int );
#define MC_WEIGHT_OFFSET(w,type) \
void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
#undef MC_OFFSET
#undef MC_WEIGHT
-void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w4_mmx( pixel *, int, pixel *, int, int );
+void x264_mc_copy_w8_mmx( pixel *, int, pixel *, int, int );
+void x264_mc_copy_w8_sse2( pixel *, int, pixel *, int, int );
+void x264_mc_copy_w8_aligned_sse2( pixel *, int, pixel *, int, int );
+void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int );
+void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
void x264_prefetch_ref_mmxext( uint8_t *, int, int );
void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
uint16_t *inter_costs, uint16_t *inv_qscales, int len );
#define MC_CHROMA(cpu)\
-void x264_mc_chroma_##cpu( uint8_t *dstu, uint8_t *dstv, int i_dst,\
- uint8_t *src, int i_src,\
+void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
+ pixel *src, int i_src,\
int dx, int dy, int i_width, int i_height );
MC_CHROMA(mmxext)
MC_CHROMA(sse2)
LOWRES(ssse3)
#define PIXEL_AVG_W(width,cpu)\
-void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w##width##_##cpu( pixel *, int, pixel *, int, pixel *, int );
/* This declares some functions that don't exist, but that isn't a problem. */
#define PIXEL_AVG_WALL(cpu)\
-PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(20,cpu);
+PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(10,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(18,cpu); PIXEL_AVG_W(20,cpu);
PIXEL_AVG_WALL(mmxext)
PIXEL_AVG_WALL(cache32_mmxext)
PIXEL_AVG_WALL(sse2_misalign)
PIXEL_AVG_WALL(cache64_ssse3)
-#if !X264_HIGH_BIT_DEPTH
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
-static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
+static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, int, pixel *, int, pixel *, int ) =\
{\
NULL,\
x264_pixel_avg2_w4_##name1,\
x264_pixel_avg2_w20_##name5,\
};
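/* The table is indexed by i_width>>2, so entries 1..5 serve widths 4, 8, 12, 16 and 20. */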
+#if X264_HIGH_BIT_DEPTH
+/* we can replace w12/w20 with w10/w18, since only 9/17 of the pixels are actually important */
+#define x264_pixel_avg2_w12_mmxext x264_pixel_avg2_w10_mmxext
+#define x264_pixel_avg2_w20_mmxext x264_pixel_avg2_w18_mmxext
+#define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w10_sse2
+#define x264_pixel_avg2_w20_sse2 x264_pixel_avg2_w18_sse2
+#else
/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
#define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3
#define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2
#define x264_pixel_avg2_w12_sse3 x264_pixel_avg2_w16_sse3
#define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w16_sse2
+#endif // X264_HIGH_BIT_DEPTH
PIXEL_AVG_WTAB(mmxext, mmxext, mmxext, mmxext, mmxext, mmxext)
+#if X264_HIGH_BIT_DEPTH
+PIXEL_AVG_WTAB(sse2, mmxext, sse2, sse2, sse2, sse2)
+#else // !X264_HIGH_BIT_DEPTH
#if ARCH_X86
PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext)
PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
PIXEL_AVG_WTAB(sse2_misalign, mmxext, mmxext, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmxext, cache64_mmxext, cache64_ssse3, cache64_ssse3, cache64_sse2)
+#endif // X264_HIGH_BIT_DEPTH
#define MC_COPY_WTAB(instr, name1, name2, name3)\
-static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
+static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, int, pixel *, int, int ) =\
{\
NULL,\
x264_mc_copy_w4_##name1,\
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
- static void (* x264_mc_##function##_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\
+ static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, int, pixel *, int, const x264_weight_t *, int ) =\
{\
x264_mc_##function##_w4_##name1,\
x264_mc_##function##_w4_##name1,\
x264_mc_##function##_w20_##instr,\
};
+#if X264_HIGH_BIT_DEPTH
+MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
+MC_WEIGHT_WTAB(weight,sse2,mmxext,sse2,12)
+#else
MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
MC_WEIGHT_WTAB(offsetsub,mmxext,mmxext,mmxext,12)
w->cacheb[i] = w->i_offset;
}
}
+#endif // !X264_HIGH_BIT_DEPTH
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
#define MC_LUMA(name,instr1,instr2)\
-static void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
- uint8_t *src[4], int i_src_stride,\
+static void mc_luma_##name( pixel *dst, int i_dst_stride,\
+ pixel *src[4], int i_src_stride,\
int mvx, int mvy,\
int i_width, int i_height, const x264_weight_t *weight )\
{\
int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
- uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+ pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
if( qpel_idx & 5 ) /* qpel interpolation needed */\
{\
- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+ pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
x264_pixel_avg_wtab_##instr1[i_width>>2](\
dst, i_dst_stride, src1, i_src_stride,\
src2, i_height );\
}
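/* qpel_idx & 5 is set when either MV component has an odd quarter-pel phase; only then is
 * the sample formed by averaging two planes (src1/src2), otherwise src1 already points at
 * the wanted full- or half-pel plane. */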
MC_LUMA(mmxext,mmxext,mmx)
+MC_LUMA(sse2,sse2,sse2)
+#if !X264_HIGH_BIT_DEPTH
#if ARCH_X86
MC_LUMA(cache32_mmxext,cache32_mmxext,mmx)
MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
#endif
-MC_LUMA(sse2,sse2,sse2)
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
+#endif // !X264_HIGH_BIT_DEPTH
#define GET_REF(name)\
-static uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
- uint8_t *src[4], int i_src_stride,\
+static pixel *get_ref_##name( pixel *dst, int *i_dst_stride,\
+ pixel *src[4], int i_src_stride,\
int mvx, int mvy,\
int i_width, int i_height, const x264_weight_t *weight )\
{\
int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
- uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
+ pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
if( qpel_idx & 5 ) /* qpel interpolation needed */\
{\
- uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
+ pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
x264_pixel_avg_wtab_##name[i_width>>2](\
dst, *i_dst_stride, src1, i_src_stride,\
src2, i_height );\
}
GET_REF(mmxext)
+GET_REF(sse2)
+#if !X264_HIGH_BIT_DEPTH
#if ARCH_X86
GET_REF(cache32_mmxext)
GET_REF(cache64_mmxext)
#endif
-GET_REF(sse2)
GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
GET_REF(cache64_ssse3)
+#endif // !X264_HIGH_BIT_DEPTH
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
-void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
-void x264_hpel_filter_c_##cpuc( uint8_t *dst, int16_t *buf, int width );\
-void x264_hpel_filter_h_##cpuh( uint8_t *dst, uint8_t *src, int width );\
-static void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,\
+void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, int stride, int width);\
+void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, int width );\
+void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, int width );\
+static void x264_hpel_filter_##cpu( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,\
int stride, int width, int height, int16_t *buf )\
{\
int realign = (intptr_t)src & (align-1);\
}
HPEL(8, mmxext, mmxext, mmxext, mmxext)
+#if X264_HIGH_BIT_DEPTH
+HPEL(16, sse2, sse2, sse2, sse2 )
+#else // !X264_HIGH_BIT_DEPTH
HPEL(16, sse2_amd, mmxext, mmxext, sse2)
#if ARCH_X86_64
void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
PLANE_INTERLEAVE(mmxext)
PLANE_INTERLEAVE(sse2)
-#endif // !X264_HIGH_BIT_DEPTH
+#endif // X264_HIGH_BIT_DEPTH
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_MMX) )
return;
- pf->memcpy_aligned = x264_memcpy_aligned_mmx;
- pf->memzero_aligned = x264_memzero_aligned_mmx;
-#if !X264_HIGH_BIT_DEPTH
pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
+ pf->memcpy_aligned = x264_memcpy_aligned_mmx;
+ pf->memzero_aligned = x264_memzero_aligned_mmx;
pf->integral_init4v = x264_integral_init4v_mmx;
pf->integral_init8v = x264_integral_init8v_mmx;
-#endif // !X264_HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_MMXEXT) )
return;
-#if !X264_HIGH_BIT_DEPTH
pf->mc_luma = mc_luma_mmxext;
pf->get_ref = get_ref_mmxext;
pf->mc_chroma = x264_mc_chroma_mmxext;
-
+ pf->hpel_filter = x264_hpel_filter_mmxext;
pf->weight = x264_mc_weight_wtab_mmxext;
+
+#if X264_HIGH_BIT_DEPTH
+ if( !(cpu&X264_CPU_SSE2) )
+ return;
+
+ if( cpu&X264_CPU_SSE2_IS_FAST )
+ {
+ pf->get_ref = get_ref_sse2;
+ pf->mc_luma = mc_luma_sse2;
+ pf->hpel_filter = x264_hpel_filter_sse2;
+ }
+
+ pf->memcpy_aligned = x264_memcpy_aligned_sse2;
+ pf->memzero_aligned = x264_memzero_aligned_sse2;
+ pf->integral_init4v = x264_integral_init4v_sse2;
+ pf->integral_init8v = x264_integral_init8v_sse2;
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
+
+ if( cpu&X264_CPU_SSE2_IS_SLOW )
+ return;
+
+ pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
+ pf->weight = x264_mc_weight_wtab_sse2;
+
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ pf->mc_chroma = x264_mc_chroma_sse2;
+
+ if( !(cpu&X264_CPU_SSSE3) )
+ return;
+
+ if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
+ pf->integral_init4v = x264_integral_init4v_ssse3;
+#else // !X264_HIGH_BIT_DEPTH
pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
pf->weight_cache = x264_weight_cache_mmxext;
pf->plane_copy_interleave = x264_plane_copy_interleave_mmxext;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;
- pf->hpel_filter = x264_hpel_filter_mmxext;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext;
}
#endif
-#endif // !X264_HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_SSE2) )
return;
pf->memcpy_aligned = x264_memcpy_aligned_sse2;
pf->memzero_aligned = x264_memzero_aligned_sse2;
-#if !X264_HIGH_BIT_DEPTH
pf->integral_init4v = x264_integral_init4v_sse2;
pf->integral_init8v = x264_integral_init8v_sse2;
pf->hpel_filter = x264_hpel_filter_sse2_amd;
pf->integral_init4h = x264_integral_init4h_sse4;
pf->integral_init8h = x264_integral_init8h_sse4;
-#endif // !X264_HIGH_BIT_DEPTH
+#endif // X264_HIGH_BIT_DEPTH
}