From: Janne Grunau
Date: Wed, 19 Mar 2014 12:48:21 +0000 (+0100)
Subject: aarch64: pixel metrics NEON asm
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=f4a82a54885f3dad7106a6855eaef50ea085b27e;p=x264

aarch64: pixel metrics NEON asm

Ported from the ARM NEON asm.
---
diff --git a/Makefile b/Makefile
index 8bc7f9a8..d68d3d8a 100644
--- a/Makefile
+++ b/Makefile
@@ -126,7 +126,7 @@ endif
 # AArch64 NEON optims
 ifeq ($(ARCH),AARCH64)
 ifneq ($(AS),)
-ASMSRC +=
+ASMSRC += common/aarch64/pixel-a.S
 SRCS +=
 OBJASM = $(ASMSRC:%.S=%.o)
 endif
diff --git a/common/aarch64/asm.S b/common/aarch64/asm.S
index b6c31930..cffa8f7d 100644
--- a/common/aarch64/asm.S
+++ b/common/aarch64/asm.S
@@ -97,3 +97,17 @@ MACH .const_data
 #define GLUE(a, b) a ## b
 #define JOIN(a, b) GLUE(a, b)
 #define X(s) JOIN(EXTERN_ASM, s)
+
+#define FDEC_STRIDE 32
+#define FENC_STRIDE 16
+
+
+.macro SUMSUB_AB sum, sub, a, b
+    add \sum, \a, \b
+    sub \sub, \a, \b
+.endm
+
+.macro transpose t1, t2, s1, s2
+    trn1 \t1, \s1, \s2
+    trn2 \t2, \s1, \s2
+.endm
diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
new file mode 100644
index 00000000..07e9a610
--- /dev/null
+++ b/common/aarch64/pixel-a.S
@@ -0,0 +1,1153 @@
+/*****************************************************************************
+ * pixel.S: aarch64 pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad
+ *          Janne Grunau
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/ + +#include "asm.S" + +const mask +.rept 16 +.byte 0xff +.endr +.rept 16 +.byte 0x00 +.endr +endconst + +const mask_ac_4_8 +.short 0, -1, -1, -1, 0, -1, -1, -1 +.short 0, -1, -1, -1, -1, -1, -1, -1 +endconst + +.macro SAD_START_4 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + uabdl v16.8h, v0.8b, v1.8b +.endm + +.macro SAD_4 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + uabal v16.8h, v0.8b, v1.8b +.endm + +.macro SAD_START_8 + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + uabdl v16.8h, v0.8b, v1.8b + uabdl v17.8h, v2.8b, v3.8b +.endm + +.macro SAD_8 + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + uabal v16.8h, v0.8b, v1.8b + uabal v17.8h, v2.8b, v3.8b +.endm + +.macro SAD_START_16 + ld1 {v1.16b}, [x2], x3 + ld1 {v0.16b}, [x0], x1 + ld1 {v3.16b}, [x2], x3 + ld1 {v2.16b}, [x0], x1 + uabdl v16.8h, v0.8b, v1.8b + uabdl2 v17.8h, v0.16b, v1.16b + uabal v16.8h, v2.8b, v3.8b + uabal2 v17.8h, v2.16b, v3.16b +.endm + +.macro SAD_16 + ld1 {v1.16b}, [x2], x3 + ld1 {v0.16b}, [x0], x1 + ld1 {v3.16b}, [x2], x3 + ld1 {v2.16b}, [x0], x1 + uabal v16.8h, v0.8b, v1.8b + uabal2 v17.8h, v0.16b, v1.16b + uabal v16.8h, v2.8b, v3.8b + uabal2 v17.8h, v2.16b, v3.16b +.endm + +.macro SAD_FUNC w, h, name +function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1 + SAD_START_\w + +.rept \h / 2 - 1 + SAD_\w +.endr +.if \w > 4 + add v16.8h, v16.8h, v17.8h +.endif + uaddlv s0, v16.8h + fmov w0, s0 + ret +endfunc +.endm + +SAD_FUNC 4, 4 +SAD_FUNC 4, 8 +SAD_FUNC 8, 4 +SAD_FUNC 8, 8 +SAD_FUNC 8, 16 +SAD_FUNC 16, 8 +SAD_FUNC 16, 16 + +.macro SAD_X_4 x, first=uabal + ld1 {v0.s}[0], [x0], x7 + ld1 {v1.s}[0], [x1], x5 + ld1 {v0.s}[1], [x0], x7 + ld1 {v1.s}[1], [x1], x5 + \first v16.8h, v1.8b, v0.8b + ld1 {v2.s}[0], [x2], x5 + ld1 {v2.s}[1], [x2], x5 + \first v17.8h, v2.8b, v0.8b + ld1 {v3.s}[0], [x3], x5 + ld1 {v3.s}[1], [x3], x5 + \first v18.8h, v3.8b, v0.8b +.if \x == 4 + ld1 {v4.s}[0], [x4], x5 + ld1 {v4.s}[1], [x4], x5 + \first v19.8h, v4.8b, v0.8b +.endif +.endm + +.macro SAD_X_8 x, first=uabal + ld1 {v0.8b}, [x0], x7 + ld1 {v1.8b}, [x1], x5 + \first v16.8h, v1.8b, v0.8b + ld1 {v2.8b}, [x2], x5 + ld1 {v5.8b}, [x0], x7 + \first v17.8h, v2.8b, v0.8b + ld1 {v3.8b}, [x3], x5 + ld1 {v1.8b}, [x1], x5 + \first v18.8h, v3.8b, v0.8b + uabal v16.8h, v1.8b, v5.8b + ld1 {v2.8b}, [x2], x5 + ld1 {v3.8b}, [x3], x5 + uabal v17.8h, v2.8b, v5.8b + uabal v18.8h, v3.8b, v5.8b +.if \x == 4 + ld1 {v4.8b}, [x4], x5 + \first v19.8h, v4.8b, v0.8b + ld1 {v4.8b}, [x4], x5 + uabal v19.8h, v4.8b, v5.8b +.endif +.endm + +.macro SAD_X_16 x, first=uabal + ld1 {v0.16b}, [x0], x7 + ld1 {v1.16b}, [x1], x5 + \first v16.8h, v1.8b, v0.8b + \first\()2 v20.8h, v1.16b, v0.16b + ld1 {v2.16b}, [x2], x5 + ld1 {v5.16b}, [x0], x7 + \first v17.8h, v2.8b, v0.8b + \first\()2 v21.8h, v2.16b, v0.16b + ld1 {v3.16b}, [x3], x5 + ld1 {v1.16b}, [x1], x5 + \first v18.8h, v3.8b, v0.8b + \first\()2 v22.8h, v3.16b, v0.16b + uabal v16.8h, v1.8b, v5.8b + uabal2 v20.8h, v1.16b, v5.16b + ld1 {v2.16b}, [x2], x5 + ld1 {v3.16b}, [x3], x5 + uabal v17.8h, v2.8b, v5.8b + uabal2 v21.8h, v2.16b, v5.16b + uabal v18.8h, v3.8b, v5.8b + uabal2 v22.8h, v3.16b, v5.16b +.if \x == 4 + ld1 {v4.16b}, [x4], x5 + \first v19.8h, v4.8b, v0.8b + \first\()2 v23.8h, v4.16b, v0.16b + ld1 {v4.16b}, [x4], x5 + uabal 
v19.8h, v4.8b, v5.8b + uabal2 v23.8h, v4.16b, v5.16b +.endif +.endm + +.macro SAD_X_FUNC x, w, h +function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1 +.if \x == 3 + mov x6, x5 + mov x5, x4 +.endif + mov x7, #FENC_STRIDE + + SAD_X_\w \x, uabdl + +.rept \h / 2 - 1 + SAD_X_\w \x +.endr + +.if \w > 8 + add v16.8h, v16.8h, v20.8h + add v17.8h, v17.8h, v21.8h + add v18.8h, v18.8h, v22.8h +.if \x == 4 + add v19.8h, v19.8h, v23.8h +.endif +.endif +// add up the sads + uaddlv s0, v16.8h + uaddlv s1, v17.8h + uaddlv s2, v18.8h + + stp s0, s1, [x6], #8 +.if \x == 3 + str s2, [x6] +.else + uaddlv s3, v19.8h + stp s2, s3, [x6] +.endif + ret +endfunc +.endm + +SAD_X_FUNC 3, 4, 4 +SAD_X_FUNC 3, 4, 8 +SAD_X_FUNC 3, 8, 4 +SAD_X_FUNC 3, 8, 8 +SAD_X_FUNC 3, 8, 16 +SAD_X_FUNC 3, 16, 8 +SAD_X_FUNC 3, 16, 16 + +SAD_X_FUNC 4, 4, 4 +SAD_X_FUNC 4, 4, 8 +SAD_X_FUNC 4, 8, 4 +SAD_X_FUNC 4, 8, 8 +SAD_X_FUNC 4, 8, 16 +SAD_X_FUNC 4, 16, 8 +SAD_X_FUNC 4, 16, 16 + + +.macro SSD_START_4 + ld1 {v16.s}[0], [x0], x1 + ld1 {v17.s}[0], [x2], x3 + usubl v2.8h, v16.8b, v17.8b + ld1 {v16.s}[0], [x0], x1 + ld1 {v17.s}[0], [x2], x3 + smull v0.4s, v2.4h, v2.4h +.endm + +.macro SSD_4 + usubl v2.8h, v16.8b, v17.8b + ld1 {v16.s}[0], [x0], x1 + ld1 {v17.s}[0], [x2], x3 + smlal v0.4s, v2.4h, v2.4h +.endm + +.macro SSD_END_4 + usubl v2.8h, v16.8b, v17.8b + smlal v0.4s, v2.4h, v2.4h +.endm + +.macro SSD_START_8 + ld1 {v16.8b}, [x0], x1 + ld1 {v17.8b}, [x2], x3 + usubl v2.8h, v16.8b, v17.8b + ld1 {v16.8b}, [x0], x1 + smull v0.4s, v2.4h, v2.4h + ld1 {v17.8b}, [x2], x3 + smlal2 v0.4s, v2.8h, v2.8h +.endm + +.macro SSD_8 + usubl v2.8h, v16.8b, v17.8b + ld1 {v16.8b}, [x0], x1 + smlal v0.4s, v2.4h, v2.4h + ld1 {v17.8b}, [x2], x3 + smlal2 v0.4s, v2.8h, v2.8h +.endm + +.macro SSD_END_8 + usubl v2.8h, v16.8b, v17.8b + smlal v0.4s, v2.4h, v2.4h + smlal2 v0.4s, v2.8h, v2.8h +.endm + +.macro SSD_START_16 + ld1 {v16.16b}, [x0], x1 + ld1 {v17.16b}, [x2], x3 + usubl v2.8h, v16.8b, v17.8b + usubl2 v3.8h, v16.16b, v17.16b + ld1 {v16.16b}, [x0], x1 + smull v0.4s, v2.4h, v2.4h + smull2 v1.4s, v2.8h, v2.8h + ld1 {v17.16b}, [x2], x3 + smlal v0.4s, v3.4h, v3.4h + smlal2 v1.4s, v3.8h, v3.8h +.endm + +.macro SSD_16 + usubl v2.8h, v16.8b, v17.8b + usubl2 v3.8h, v16.16b, v17.16b + ld1 {v16.16b}, [x0], x1 + smlal v0.4s, v2.4h, v2.4h + smlal2 v1.4s, v2.8h, v2.8h + ld1 {v17.16b}, [x2], x3 + smlal v0.4s, v3.4h, v3.4h + smlal2 v1.4s, v3.8h, v3.8h +.endm + +.macro SSD_END_16 + usubl v2.8h, v16.8b, v17.8b + usubl2 v3.8h, v16.16b, v17.16b + smlal v0.4s, v2.4h, v2.4h + smlal2 v1.4s, v2.8h, v2.8h + smlal v0.4s, v3.4h, v3.4h + smlal2 v1.4s, v3.8h, v3.8h + add v0.4s, v0.4s, v1.4s +.endm + +.macro SSD_FUNC w h +function x264_pixel_ssd_\w\()x\h\()_neon, export=1 + SSD_START_\w +.rept \h-2 + SSD_\w +.endr + SSD_END_\w + + addv s0, v0.4s + mov w0, v0.s[0] + ret +endfunc +.endm + +SSD_FUNC 4, 4 +SSD_FUNC 4, 8 +SSD_FUNC 8, 4 +SSD_FUNC 8, 8 +SSD_FUNC 8, 16 +SSD_FUNC 16, 8 +SSD_FUNC 16, 16 + +.macro pixel_var_8 h +function x264_pixel_var_8x\h\()_neon, export=1 + ld1 {v16.8b}, [x0], x1 + ld1 {v17.8b}, [x0], x1 + mov x2, \h - 4 + umull v1.8h, v16.8b, v16.8b + uxtl v0.8h, v16.8b + umull v2.8h, v17.8b, v17.8b + uaddw v0.8h, v0.8h, v17.8b + ld1 {v18.8b}, [x0], x1 + uaddlp v1.4s, v1.8h + uaddlp v2.4s, v2.8h + ld1 {v19.8b}, [x0], x1 + +1: subs x2, x2, #4 + uaddw v0.8h, v0.8h, v18.8b + umull v24.8h, v18.8b, v18.8b + ld1 {v20.8b}, [x0], x1 + uaddw v0.8h, v0.8h, v19.8b + umull v25.8h, v19.8b, v19.8b + uadalp v1.4s, v24.8h + ld1 {v21.8b}, [x0], x1 + uaddw v0.8h, v0.8h, v20.8b + umull 
v26.8h, v20.8b, v20.8b + uadalp v2.4s, v25.8h + ld1 {v18.8b}, [x0], x1 + uaddw v0.8h, v0.8h, v21.8b + umull v27.8h, v21.8b, v21.8b + uadalp v1.4s, v26.8h + ld1 {v19.8b}, [x0], x1 + uadalp v2.4s, v27.8h + b.gt 1b + + uaddw v0.8h, v0.8h, v18.8b + umull v28.8h, v18.8b, v18.8b + uaddw v0.8h, v0.8h, v19.8b + umull v29.8h, v19.8b, v19.8b + uadalp v1.4s, v28.8h + uadalp v2.4s, v29.8h + + b x264_var_end +endfunc +.endm + +pixel_var_8 8 +pixel_var_8 16 + +function x264_pixel_var_16x16_neon, export=1 + ld1 {v16.16b}, [x0], x1 + ld1 {v17.16b}, [x0], x1 + mov x2, #14 + umull v1.8h, v16.8b, v16.8b + umull2 v2.8h, v16.16b, v16.16b + uxtl v0.8h, v16.8b + uaddlp v1.4s, v1.8h + uaddlp v2.4s, v2.8h + uaddw2 v0.8h, v0.8h, v16.16b + +1: subs x2, x2, #2 + ld1 {v18.16b}, [x0], x1 + uaddw v0.8h, v0.8h, v17.8b + umull v3.8h, v17.8b, v17.8b + uaddw2 v0.8h, v0.8h, v17.16b + umull2 v4.8h, v17.16b, v17.16b + uadalp v1.4s, v3.8h + uadalp v2.4s, v4.8h + + ld1 {v17.16b}, [x0], x1 + uaddw v0.8h, v0.8h, v18.8b + umull v5.8h, v18.8b, v18.8b + uaddw2 v0.8h, v0.8h, v18.16b + umull2 v6.8h, v18.16b, v18.16b + uadalp v1.4s, v5.8h + uadalp v2.4s, v6.8h + b.gt 1b + + uaddw v0.8h, v0.8h, v17.8b + umull v3.8h, v17.8b, v17.8b + uaddw2 v0.8h, v0.8h, v17.16b + umull2 v4.8h, v17.16b, v17.16b + uadalp v1.4s, v3.8h + uadalp v2.4s, v4.8h +endfunc + +function x264_var_end + add v1.4s, v1.4s, v2.4s + uaddlv s0, v0.8h + uaddlv d1, v1.4s + mov w0, v0.s[0] + mov x1, v1.d[0] + orr x0, x0, x1, lsl #32 + ret +endfunc + + +.macro pixel_var2_8 h +function x264_pixel_var2_8x\h\()_neon, export=1 + ld1 {v16.8b}, [x0], x1 + ld1 {v18.8b}, [x2], x3 + ld1 {v17.8b}, [x0], x1 + ld1 {v19.8b}, [x2], x3 + mov x5, \h - 4 + usubl v6.8h, v16.8b, v18.8b + usubl v7.8h, v17.8b, v19.8b + ld1 {v16.8b}, [x0], x1 + ld1 {v18.8b}, [x2], x3 + smull v2.4s, v6.4h, v6.4h + smull2 v3.4s, v6.8h, v6.8h + add v0.8h, v6.8h, v7.8h + smlal v2.4s, v7.4h, v7.4h + smlal2 v3.4s, v7.8h, v7.8h + + usubl v6.8h, v16.8b, v18.8b + +1: subs x5, x5, #2 + ld1 {v17.8b}, [x0], x1 + ld1 {v19.8b}, [x2], x3 + smlal v2.4s, v6.4h, v6.4h + smlal2 v3.4s, v6.8h, v6.8h + usubl v7.8h, v17.8b, v19.8b + add v0.8h, v0.8h, v6.8h + ld1 {v16.8b}, [x0], x1 + ld1 {v18.8b}, [x2], x3 + smlal v2.4s, v7.4h, v7.4h + smlal2 v3.4s, v7.8h, v7.8h + usubl v6.8h, v16.8b, v18.8b + add v0.8h, v0.8h, v7.8h + b.gt 1b + + ld1 {v17.8b}, [x0], x1 + ld1 {v19.8b}, [x2], x3 + smlal v2.4s, v6.4h, v6.4h + smlal2 v3.4s, v6.8h, v6.8h + usubl v7.8h, v17.8b, v19.8b + add v0.8h, v0.8h, v6.8h + smlal v2.4s, v7.4h, v7.4h + add v0.8h, v0.8h, v7.8h + smlal2 v3.4s, v7.8h, v7.8h + + saddlv s0, v0.8h + add v2.4s, v2.4s, v3.4s + mov w0, v0.s[0] + addv s1, v2.4s + sxtw x0, w0 + mov w1, v1.s[0] + mul x0, x0, x0 + str w1, [x4] + sub x0, x1, x0, lsr # 6 + (\h >> 4) + + ret +endfunc +.endm + +pixel_var2_8 8 +pixel_var2_8 16 + + +function x264_pixel_satd_4x4_neon, export=1 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 + + usubl v0.8h, v0.8b, v1.8b + usubl v1.8h, v2.8b, v3.8b + SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h + + zip1 v0.2d, v2.2d, v3.2d + zip2 v1.2d, v2.2d, v3.2d + SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h + + trn1 v0.8h, v2.8h, v3.8h + trn2 v1.8h, v2.8h, v3.8h + SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h + + trn1 v0.4s, v2.4s, v3.4s + trn2 v1.4s, v2.4s, v3.4s + abs v0.8h, v0.8h + abs v1.8h, v1.8h + umax v0.8h, v0.8h, v1.8h + + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret +endfunc + +function 
x264_pixel_satd_4x8_neon, export=1 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v5.s}[0], [x2], x3 + ld1 {v4.s}[0], [x0], x1 + ld1 {v7.s}[0], [x2], x3 + ld1 {v6.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 + ld1 {v5.s}[1], [x2], x3 + ld1 {v4.s}[1], [x0], x1 + ld1 {v7.s}[1], [x2], x3 + ld1 {v6.s}[1], [x0], x1 + b x264_satd_4x8_8x4_end_neon +endfunc + +function x264_pixel_satd_8x4_neon, export=1 + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + ld1 {v5.8b}, [x2], x3 + ld1 {v4.8b}, [x0], x1 + ld1 {v7.8b}, [x2], x3 + ld1 {v6.8b}, [x0], x1 +endfunc + +function x264_satd_4x8_8x4_end_neon + usubl v0.8h, v0.8b, v1.8b + usubl v1.8h, v2.8b, v3.8b + usubl v2.8h, v4.8b, v5.8b + usubl v3.8h, v6.8b, v7.8b + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + + SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h + SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h + + trn1 v0.8h, v4.8h, v5.8h + trn2 v1.8h, v4.8h, v5.8h + trn1 v2.8h, v6.8h, v7.8h + trn2 v3.8h, v6.8h, v7.8h + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + + trn1 v0.4s, v16.4s, v18.4s + trn2 v1.4s, v16.4s, v18.4s + trn1 v2.4s, v17.4s, v19.4s + trn2 v3.4s, v17.4s, v19.4s + abs v0.8h, v0.8h + abs v1.8h, v1.8h + abs v2.8h, v2.8h + abs v3.8h, v3.8h + umax v0.8h, v0.8h, v1.8h + umax v1.8h, v2.8h, v3.8h + add v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret +endfunc + +function x264_pixel_satd_8x8_neon, export=1 + mov x4, x30 + + bl x264_satd_8x8_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +function x264_pixel_satd_8x16_neon, export=1 + mov x4, x30 + + bl x264_satd_8x8_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v0.8h, v1.8h + + bl x264_satd_8x8_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v31.8h, v0.8h, v1.8h + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +.macro SUMSUBL_AB sum, sub, a, b + uaddl \sum, \a, \b + usubl \sub, \a, \b +.endm + +.macro load_diff_fly_8x8 + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + usubl v16.8h, v0.8b, v1.8b + ld1 {v5.8b}, [x2], x3 + ld1 {v4.8b}, [x0], x1 + usubl v17.8h, v2.8b, v3.8b + ld1 {v7.8b}, [x2], x3 + ld1 {v6.8b}, [x0], x1 + usubl v18.8h, v4.8b, v5.8b + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + usubl v19.8h, v6.8b, v7.8b + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + usubl v20.8h, v0.8b, v1.8b + ld1 {v5.8b}, [x2], x3 + ld1 {v4.8b}, [x0], x1 + usubl v21.8h, v2.8b, v3.8b + ld1 {v7.8b}, [x2], x3 + ld1 {v6.8b}, [x0], x1 + + SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h + SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h + + usubl v22.8h, v4.8b, v5.8b + usubl v23.8h, v6.8b, v7.8b +.endm + +.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d + SUMSUB_AB \s1, \d1, \a, \b + SUMSUB_AB \s2, \d2, \c, \d +.endm + +.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4 + SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4 + SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4 +.endm + +function x264_satd_8x8_neon + load_diff_fly_8x8 +endfunc + +// one vertical hadamard pass and two horizontal +function x264_satd_8x4v_8x8h_neon + SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h + SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h + + HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, 
v3.8h + + transpose v0.8h, v1.8h, v16.8h, v17.8h + transpose v2.8h, v3.8h, v18.8h, v19.8h + transpose v4.8h, v5.8h, v20.8h, v21.8h + transpose v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h + SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h + + transpose v0.4s, v2.4s, v16.4s, v18.4s + transpose v1.4s, v3.4s, v17.4s, v19.4s + transpose v4.4s, v6.4s, v20.4s, v22.4s + transpose v5.4s, v7.4s, v21.4s, v23.4s + + abs v0.8h, v0.8h + abs v1.8h, v1.8h + abs v2.8h, v2.8h + abs v3.8h, v3.8h + abs v4.8h, v4.8h + abs v5.8h, v5.8h + abs v6.8h, v6.8h + abs v7.8h, v7.8h + + umax v0.8h, v0.8h, v2.8h + umax v1.8h, v1.8h, v3.8h + umax v2.8h, v4.8h, v6.8h + umax v3.8h, v5.8h, v7.8h + + ret +endfunc + +function x264_pixel_satd_16x8_neon, export=1 + mov x4, x30 + + bl x264_satd_16x4_neon + add v30.8h, v0.8h, v1.8h + add v31.8h, v2.8h, v3.8h + + bl x264_satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v30.8h, v0.8h + add v31.8h, v31.8h, v1.8h + + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +function x264_pixel_satd_16x16_neon, export=1 + mov x4, x30 + + bl x264_satd_16x4_neon + add v30.8h, v0.8h, v1.8h + add v31.8h, v2.8h, v3.8h + + bl x264_satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v30.8h, v0.8h + add v31.8h, v31.8h, v1.8h + + bl x264_satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v30.8h, v0.8h + add v31.8h, v31.8h, v1.8h + + bl x264_satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v30.8h, v0.8h + add v31.8h, v31.8h, v1.8h + + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +function x264_satd_16x4_neon + ld1 {v1.16b}, [x2], x3 + ld1 {v0.16b}, [x0], x1 + ld1 {v3.16b}, [x2], x3 + ld1 {v2.16b}, [x0], x1 + usubl v16.8h, v0.8b, v1.8b + usubl2 v20.8h, v0.16b, v1.16b + ld1 {v5.16b}, [x2], x3 + ld1 {v4.16b}, [x0], x1 + usubl v17.8h, v2.8b, v3.8b + usubl2 v21.8h, v2.16b, v3.16b + ld1 {v7.16b}, [x2], x3 + ld1 {v6.16b}, [x0], x1 + + usubl v18.8h, v4.8b, v5.8b + usubl2 v22.8h, v4.16b, v5.16b + usubl v19.8h, v6.8b, v7.8b + usubl2 v23.8h, v6.16b, v7.16b + + SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h + SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h + + b x264_satd_8x4v_8x8h_neon +endfunc + + +function x264_pixel_sa8d_8x8_neon, export=1 + mov x4, x30 + bl x264_sa8d_8x8_neon + add v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + add w0, w0, #1 + lsr w0, w0, #1 + ret x4 +endfunc + +function x264_pixel_sa8d_16x16_neon, export=1 + mov x4, x30 + bl x264_sa8d_8x8_neon + uaddlp v30.4s, v0.8h + uaddlp v31.4s, v1.8h + bl x264_sa8d_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + sub x0, x0, x1, lsl #4 + sub x2, x2, x3, lsl #4 + add x0, x0, #8 + add x2, x2, #8 + bl x264_sa8d_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + bl x264_sa8d_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + add v0.4s, v30.4s, v31.4s + addv s0, v0.4s + mov w0, v0.s[0] + add w0, w0, #1 + lsr w0, w0, #1 + ret x4 +endfunc + +function x264_sa8d_8x8_neon + load_diff_fly_8x8 + + SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h + SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h + + HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h + SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h + SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h + SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h + SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h + + transpose v20.8h, v21.8h, v16.8h, v17.8h + 
transpose v4.8h, v5.8h, v0.8h, v1.8h + transpose v22.8h, v23.8h, v18.8h, v19.8h + transpose v6.8h, v7.8h, v2.8h, v3.8h + + SUMSUB_AB v28.8h, v29.8h, v20.8h, v21.8h + SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h + SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h + SUMSUB_AB v26.8h, v27.8h, v6.8h, v7.8h + + transpose v20.4s, v22.4s, v28.4s, v0.4s + transpose v21.4s, v23.4s, v29.4s, v1.4s + transpose v16.4s, v18.4s, v24.4s, v26.4s + transpose v17.4s, v19.4s, v25.4s, v27.4s + + SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h + SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h + SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h + SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h + + transpose v16.2d, v20.2d, v0.2d, v4.2d + transpose v17.2d, v21.2d, v1.2d, v5.2d + transpose v18.2d, v22.2d, v2.2d, v6.2d + transpose v19.2d, v23.2d, v3.2d, v7.2d + + abs v16.8h, v16.8h + abs v20.8h, v20.8h + abs v17.8h, v17.8h + abs v21.8h, v21.8h + abs v18.8h, v18.8h + abs v22.8h, v22.8h + abs v19.8h, v19.8h + abs v23.8h, v23.8h + + umax v16.8h, v16.8h, v20.8h + umax v17.8h, v17.8h, v21.8h + umax v18.8h, v18.8h, v22.8h + umax v19.8h, v19.8h, v23.8h + + add v0.8h, v16.8h, v17.8h + add v1.8h, v18.8h, v19.8h + + ret +endfunc + + +.macro HADAMARD_AC w h +function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1 + movrel x5, mask_ac_4_8 + mov x4, x30 + ld1 {v30.8h,v31.8h}, [x5] + movi v28.16b, #0 + movi v29.16b, #0 + + bl x264_hadamard_ac_8x8_neon +.if \h > 8 + bl x264_hadamard_ac_8x8_neon +.endif +.if \w > 8 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + bl x264_hadamard_ac_8x8_neon +.endif +.if \w * \h == 256 + sub x0, x0, x1, lsl #4 + bl x264_hadamard_ac_8x8_neon +.endif + + addv s1, v29.4s + addv s0, v28.4s + mov w1, v1.s[0] + mov w0, v0.s[0] + lsr w1, w1, #2 + lsr w0, w0, #1 + orr x0, x0, x1, lsl #32 + ret x4 +endfunc +.endm + +HADAMARD_AC 8, 8 +HADAMARD_AC 8, 16 +HADAMARD_AC 16, 8 +HADAMARD_AC 16, 16 + +// v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8 +function x264_hadamard_ac_8x8_neon + ld1 {v16.8b}, [x0], x1 + ld1 {v17.8b}, [x0], x1 + ld1 {v18.8b}, [x0], x1 + ld1 {v19.8b}, [x0], x1 + SUMSUBL_AB v0.8h, v1.8h, v16.8b, v17.8b + ld1 {v20.8b}, [x0], x1 + ld1 {v21.8b}, [x0], x1 + SUMSUBL_AB v2.8h, v3.8h, v18.8b, v19.8b + ld1 {v22.8b}, [x0], x1 + ld1 {v23.8b}, [x0], x1 + SUMSUBL_AB v4.8h, v5.8h, v20.8b, v21.8b + SUMSUBL_AB v6.8h, v7.8h, v22.8b, v23.8b + + SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h + SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h + + transpose v0.8h, v1.8h, v16.8h, v17.8h + transpose v2.8h, v3.8h, v18.8h, v19.8h + transpose v4.8h, v5.8h, v20.8h, v21.8h + transpose v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h + SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h + + transpose v0.4s, v2.4s, v16.4s, v18.4s + transpose v1.4s, v3.4s, v17.4s, v19.4s + transpose v4.4s, v6.4s, v20.4s, v22.4s + transpose v5.4s, v7.4s, v21.4s, v23.4s + + SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h + SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h + SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h + + abs v0.8h, v16.8h + abs v4.8h, v20.8h + abs v1.8h, v17.8h + abs v5.8h, v21.8h + abs v2.8h, v18.8h + abs v6.8h, v22.8h + abs v3.8h, v19.8h + abs v7.8h, v23.8h + + add v0.8h, v0.8h, v4.8h + add v1.8h, v1.8h, v5.8h + and v0.16b, v0.16b, v30.16b + add v2.8h, v2.8h, v6.8h + add v3.8h, v3.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + uadalp v28.4s, v0.8h + uadalp v28.4s, v1.8h + + SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h + 
SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h + SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h + SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h + + transpose v16.2d, v17.2d, v6.2d, v7.2d + transpose v18.2d, v19.2d, v4.2d, v5.2d + transpose v20.2d, v21.2d, v2.2d, v3.2d + + abs v16.8h, v16.8h + abs v17.8h, v17.8h + abs v18.8h, v18.8h + abs v19.8h, v19.8h + abs v20.8h, v20.8h + abs v21.8h, v21.8h + + transpose v7.2d, v6.2d, v1.2d, v0.2d + + umax v3.8h, v16.8h, v17.8h + umax v2.8h, v18.8h, v19.8h + umax v1.8h, v20.8h, v21.8h + + SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h + + add v2.8h, v2.8h, v3.8h + add v2.8h, v2.8h, v1.8h + and v4.16b, v4.16b, v31.16b + add v2.8h, v2.8h, v2.8h + abs v5.8h, v5.8h + abs v4.8h, v4.8h + add v2.8h, v2.8h, v5.8h + add v2.8h, v2.8h, v4.8h + uadalp v29.4s, v2.8h + ret +endfunc + + +function x264_pixel_ssim_4x4x2_core_neon, export=1 + ld1 {v0.8b}, [x0], x1 + ld1 {v2.8b}, [x2], x3 + umull v16.8h, v0.8b, v0.8b + umull v17.8h, v0.8b, v2.8b + umull v18.8h, v2.8b, v2.8b + + ld1 {v28.8b}, [x0], x1 + ld1 {v29.8b}, [x2], x3 + umull v20.8h, v28.8b, v28.8b + umull v21.8h, v28.8b, v29.8b + umull v22.8h, v29.8b, v29.8b + + uaddlp v16.4s, v16.8h + uaddlp v17.4s, v17.8h + uaddl v0.8h, v0.8b, v28.8b + uadalp v16.4s, v18.8h + uaddl v1.8h, v2.8b, v29.8b + + ld1 {v26.8b}, [x0], x1 + ld1 {v27.8b}, [x2], x3 + umull v23.8h, v26.8b, v26.8b + umull v24.8h, v26.8b, v27.8b + umull v25.8h, v27.8b, v27.8b + + uadalp v16.4s, v20.8h + uaddw v0.8h, v0.8h, v26.8b + uadalp v17.4s, v21.8h + uaddw v1.8h, v1.8h, v27.8b + uadalp v16.4s, v22.8h + + ld1 {v28.8b}, [x0], x1 + ld1 {v29.8b}, [x2], x3 + umull v20.8h, v28.8b, v28.8b + umull v21.8h, v28.8b, v29.8b + umull v22.8h, v29.8b, v29.8b + + uadalp v16.4s, v23.8h + uaddw v0.8h, v0.8h, v28.8b + uadalp v17.4s, v24.8h + uaddw v1.8h, v1.8h, v29.8b + uadalp v16.4s, v25.8h + + uadalp v16.4s, v20.8h + uadalp v17.4s, v21.8h + uadalp v16.4s, v22.8h + + uaddlp v0.4s, v0.8h + uaddlp v1.4s, v1.8h + + addp v0.4s, v0.4s, v0.4s + addp v1.4s, v1.4s, v1.4s + addp v2.4s, v16.4s, v16.4s + addp v3.4s, v17.4s, v17.4s + + st4 {v0.2s,v1.2s,v2.2s,v3.2s}, [x4] + ret +endfunc + +function x264_pixel_ssim_end4_neon, export=1 + mov x5, #4 + ld1 {v16.4s,v17.4s}, [x0], #32 + ld1 {v18.4s,v19.4s}, [x1], #32 + mov w4, #0x99bb + subs x2, x5, w2, uxtw + mov w3, #416 // ssim_c1 = .01*.01*255*255*64 + movk w4, #0x03, lsl #16 // ssim_c2 = .03*.03*255*255*64*63 + add v0.4s, v16.4s, v18.4s + add v1.4s, v17.4s, v19.4s + add v0.4s, v0.4s, v1.4s + ld1 {v20.4s,v21.4s}, [x0], #32 + ld1 {v22.4s,v23.4s}, [x1], #32 + add v2.4s, v20.4s, v22.4s + add v3.4s, v21.4s, v23.4s + add v1.4s, v1.4s, v2.4s + ld1 {v16.4s}, [x0], #16 + ld1 {v18.4s}, [x1], #16 + add v16.4s, v16.4s, v18.4s + add v2.4s, v2.4s, v3.4s + add v3.4s, v3.4s, v16.4s + + dup v30.4s, w3 + dup v31.4s, w4 + + transpose v4.4s, v5.4s, v0.4s, v1.4s + transpose v6.4s, v7.4s, v2.4s, v3.4s + transpose v0.2d, v2.2d, v4.2d, v6.2d + transpose v1.2d, v3.2d, v5.2d, v7.2d + + mul v16.4s, v0.4s, v1.4s // s1*s2 + mul v0.4s, v0.4s, v0.4s + mla v0.4s, v1.4s, v1.4s // s1*s1 + s2*s2 + + shl v3.4s, v3.4s, #7 + shl v2.4s, v2.4s, #6 + add v1.4s, v16.4s, v16.4s + + sub v2.4s, v2.4s, v0.4s // vars + sub v3.4s, v3.4s, v1.4s // covar*2 + add v0.4s, v0.4s, v30.4s + add v2.4s, v2.4s, v31.4s + add v1.4s, v1.4s, v30.4s + add v3.4s, v3.4s, v31.4s + + scvtf v0.4s, v0.4s + scvtf v2.4s, v2.4s + scvtf v1.4s, v1.4s + scvtf v3.4s, v3.4s + + fmul v0.4s, v0.4s, v2.4s + fmul v1.4s, v1.4s, v3.4s + + fdiv v0.4s, v1.4s, v0.4s + + b.eq 1f + movrel x3, mask + add x3, x3, x2, lsl #2 + ld1 {v29.4s}, [x3] + and v0.16b, 
v0.16b, v29.16b +1: + faddp v0.4s, v0.4s, v0.4s + faddp s0, v0.2s + ret +endfunc diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h new file mode 100644 index 00000000..9c7768c4 --- /dev/null +++ b/common/aarch64/pixel.h @@ -0,0 +1,69 @@ +/***************************************************************************** + * pixel.h: aarch64 pixel metrics + ***************************************************************************** + * Copyright (C) 2009-2014 x264 project + * + * Authors: David Conrad + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_AARCH64_PIXEL_H +#define X264_AARCH64_PIXEL_H + +#define DECL_PIXELS( ret, name, suffix, args ) \ + ret x264_pixel_##name##_16x16_##suffix args;\ + ret x264_pixel_##name##_16x8_##suffix args;\ + ret x264_pixel_##name##_8x16_##suffix args;\ + ret x264_pixel_##name##_8x8_##suffix args;\ + ret x264_pixel_##name##_8x4_##suffix args;\ + ret x264_pixel_##name##_4x8_##suffix args;\ + ret x264_pixel_##name##_4x4_##suffix args;\ + +#define DECL_X1( name, suffix ) \ + DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) ) + +#define DECL_X4( name, suffix ) \ + DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\ + DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) ) + +DECL_X1( sad, neon ) +DECL_X4( sad, neon ) +DECL_X1( satd, neon ) +DECL_X1( ssd, neon ) + +int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); + +uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t ); +int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); + +uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t ); + +void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t, + const uint8_t *, intptr_t, + int sums[2][4] ); +float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width ); + +#endif diff --git a/common/pixel.c b/common/pixel.c index a06f5dbe..5524af73 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -38,6 +38,9 @@ # include "arm/pixel.h" # include "arm/predict.h" #endif +#if 
ARCH_AARCH64 +# include "aarch64/pixel.h" +#endif /**************************************************************************** @@ -496,7 +499,7 @@ SATD_X_DECL7( _xop ) #endif #if !HIGH_BIT_DEPTH -#if HAVE_ARMV6 +#if HAVE_ARMV6 || ARCH_AARCH64 SATD_X_DECL7( _neon ) #endif #endif // !HIGH_BIT_DEPTH @@ -524,6 +527,10 @@ INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 ) INTRA_MBCMP_8x8( sad, _neon, _neon ) INTRA_MBCMP_8x8(sa8d, _neon, _neon ) #endif +#if !HIGH_BIT_DEPTH && ARCH_AARCH64 +INTRA_MBCMP_8x8( sad, _neon, _c ) +INTRA_MBCMP_8x8(sa8d, _neon, _c ) +#endif #define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\ void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\ @@ -589,6 +596,16 @@ INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon ) INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon ) #endif +#if !HIGH_BIT_DEPTH && ARCH_AARCH64 +INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _c ) +INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _c ) +INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _c ) +INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _c ) +INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c ) +INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c ) +INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _c ) +INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _c ) +#endif // No C implementation of intra_satd_x9. See checkasm for its behavior, // or see x264_mb_analyse_intra for the entirely different algorithm we @@ -1390,6 +1407,46 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) } } #endif + +#if ARCH_AARCH64 + if( cpu&X264_CPU_NEON ) + { + INIT7( sad, _neon ); + // AArch64 has no distinct instructions for aligned load/store + INIT7_NAME( sad_aligned, sad, _neon ); + INIT7( sad_x3, _neon ); + INIT7( sad_x4, _neon ); + INIT7( ssd, _neon ); + INIT7( satd, _neon ); + INIT7( satd_x3, _neon ); + INIT7( satd_x4, _neon ); + INIT4( hadamard_ac, _neon ); + + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; + pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; + + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; + + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon; + pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_neon; + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_neon; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_neon; + pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_neon; + pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_neon; + pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon; + pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon; + + pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; + pixf->ssim_end4 = x264_pixel_ssim_end4_neon; + } +#endif // ARCH_AARCH64 + #endif // HIGH_BIT_DEPTH #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC )
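
For cross-checking, here is a minimal scalar sketch of two of the metrics the NEON routines above implement. It is an illustration only, not part of this patch: the names ref_sad and ref_var_8x8 are made up, and x264's own C reference versions live in common/pixel.c. The semantics follow the assembly: SAD sums absolute pixel differences, and pixel_var packs the pixel sum into the low 32 bits and the sum of squares into the high 32 bits of the return value, which is what the "orr x0, x0, x1, lsl #32" at the end of x264_var_end produces.

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute differences over a w x h block of 8-bit pixels;
 * strides are in bytes, matching the (uint8_t *, intptr_t) signatures
 * declared in common/aarch64/pixel.h above. Illustrative name. */
static int ref_sad( const uint8_t *pix1, intptr_t stride1,
                    const uint8_t *pix2, intptr_t stride2, int w, int h )
{
    int sum = 0;
    for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
        for( int x = 0; x < w; x++ )
            sum += abs( pix1[x] - pix2[x] );
    return sum;
}

/* Pixel sum in the low 32 bits, sum of squares in the high 32 bits,
 * mirroring the packing done by x264_pixel_var_8x8_neon / x264_var_end. */
static uint64_t ref_var_8x8( const uint8_t *pix, intptr_t stride )
{
    uint32_t sum = 0, sqr = 0;
    for( int y = 0; y < 8; y++, pix += stride )
        for( int x = 0; x < 8; x++ )
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }
    return sum + ( (uint64_t)sqr << 32 );
}

A checkasm-style test would compare such scalar results against the NEON routines on random blocks. The SATD and SA8D paths additionally run a 4x4 or 8x8 Hadamard transform on the difference block before summing absolute values (with a final halving for SA8D), which is what the SUMSUB_AB/transpose sequences in pixel-a.S implement.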