From: David Conrad Date: Sun, 23 Aug 2009 08:35:10 +0000 (-0700) Subject: GSOC merge part 4: ARM NEON mc assembly functions X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=6bf21c631a0cf073ad0503e6f3a9eeabacc5078a;p=x264 GSOC merge part 4: ARM NEON mc assembly functions prefetch, memcpy_aligned, memzero_aligned, avg, mc_luma, get_ref, mc_chroma, hpel_filter, frame_init_lowres --- diff --git a/Makefile b/Makefile index 2b3d029f..2f3b09e3 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,8 @@ endif # NEON optims ifeq ($(ARCH),ARM) ifneq ($(AS),) -ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S +ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S +SRCS += common/arm/mc-c.c OBJASM = $(ASMSRC:%.S=%.o) endif endif diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S new file mode 100644 index 00000000..33b8dadf --- /dev/null +++ b/common/arm/mc-a.S @@ -0,0 +1,1044 @@ +/***************************************************************************** + * mc.S: h264 encoder + ***************************************************************************** + * Copyright (C) 2009 x264 project + * + * Authors: David Conrad + * Mans Rullgard + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + *****************************************************************************/ + +#include "asm.S" + +.fpu neon +.text + +// note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8 +// They also use nothing above armv5te, but we don't care about pre-armv6 + +// void prefetch_ref( uint8_t *pix, int stride, int parity ) +function x264_prefetch_ref_arm, export=1 + sub r2, r2, #1 + add r0, r0, #64 + and r2, r2, r1 + add r0, r0, r2, lsl #3 + add r2, r1, r1, lsl #1 + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + add r3, r0, r1, lsl #2 + pld [r0, r2] + pld [r3] + pld [r3, r1] + pld [r3, r1, lsl #1] + pld [r3, r2] + bx lr +.endfunc + +// void prefetch_fenc( uint8_t *pix_y, int stride_y, +// uint8_t *pix_uv, int stride_uv, int mb_x ) +function x264_prefetch_fenc_arm, export=1 + ldr ip, [sp] + push {lr} + and lr, ip, #3 + smulbb lr, lr, r1 // note: this assumes stride_y is <= 16 bits signed + and ip, ip, #6 + smulbb ip, ip, r3 + add r0, r0, #64 + add r2, r2, #64 + add r0, r0, lr, lsl #2 + pld [r0] + add lr, r0, r1, lsl #1 + pld [r0, r1] + pld [lr] + add r2, r2, ip, lsl #2 + pld [lr, r1] + pld [r2] + add ip, r2, r3, lsl #1 + pld [r2, r3] + pld [ip] + pld [ip, r3] + pop {pc} +.endfunc + + +// void *x264_memcpy_aligned( void * dst, const void * src, size_t n ) +function x264_memcpy_aligned_neon, export=1 + orr r3, r0, r1, lsr #1 + movrel ip, memcpy_table + and r3, r3, #0xc + ldr pc, [ip, r3] +.endfunc + +.macro MEMCPY_ALIGNED srcalign dstalign +function memcpy_aligned_\dstalign\()_\srcalign\()_neon + mov r3, r0 +.if \srcalign == 8 && \dstalign == 8 + sub r2, #16 + vld1.64 {d0}, [r1,:64]! + vst1.64 {d0}, [r3,:64]! 
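+// src and dst are only 8-byte aligned here: copying one doubleword first
+// (and one more after the main loop) brings both pointers up to 16-byte
+// alignment, so the bulk copy below can use :128 alignment hints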
+ .set r1align, 128 + .set r3align, 128 +.else + .set r1align, \srcalign * 8 + .set r3align, \dstalign * 8 +.endif + tst r2, #16 + beq 32f + sub r2, #16 + vld1.64 {d0-d1}, [r1,:r1align]! + vst1.64 {d0-d1}, [r3,:r3align]! +32: // n is a multiple of 32 + tst r2, #32 + beq 64f + sub r2, #32 + vld1.64 {d0-d3}, [r1,:r1align]! + vst1.64 {d0-d3}, [r3,:r3align]! +64: // n is a multiple of 64 + subs r2, #64 + vld1.64 {d0-d3}, [r1,:r1align]! + vld1.64 {d4-d7}, [r1,:r1align]! + vst1.64 {d0-d3}, [r3,:r3align]! + vst1.64 {d4-d7}, [r3,:r3align]! + bgt 64b +.if \srcalign == 8 && \dstalign == 8 + vld1.64 {d0}, [r1,:64]! + vst1.64 {d0}, [r3,:64]! +.endif + bx lr +.endfunc +.endm + +MEMCPY_ALIGNED 16, 16 +MEMCPY_ALIGNED 16, 8 +MEMCPY_ALIGNED 8, 16 +MEMCPY_ALIGNED 8, 8 + +.section .rodata +memcpy_table: +.word memcpy_aligned_16_16_neon +.word memcpy_aligned_16_8_neon +.word memcpy_aligned_8_16_neon +.word memcpy_aligned_8_8_neon +.text + + +// void x264_memzero_aligned( void *dst, size_t n ) +function x264_memzero_aligned_neon, export=1 + vmov.i8 q0, #0 + vmov.i8 q1, #0 +memzero_loop: + subs r1, #128 +.rept 4 + vst1.64 {d0-d3}, [r0,:128]! +.endr + bgt memzero_loop + bx lr +.endfunc + + +// void pixel_avg( uint8_t *dst, int dst_stride, +// uint8_t *src1, int src1_stride, +// uint8_t *src2, int src2_stride, int weight ); +.macro AVGH w h +function x264_pixel_avg_\w\()x\h\()_neon, export=1 + ldr ip, [sp, #8] + push {r4-r6,lr} + cmp ip, #32 + ldrd r4, [sp, #16] + mov lr, #\h + beq x264_pixel_avg_w\w\()_neon + rsbs r6, ip, #64 + blt x264_pixel_avg_weight_w\w\()_add_sub_neon // weight > 64 + cmp ip, #0 + bge x264_pixel_avg_weight_w\w\()_add_add_neon + b x264_pixel_avg_weight_w\w\()_sub_add_neon // weight < 0 +.endfunc +.endm + +AVGH 4, 2 +AVGH 4, 4 +AVGH 4, 8 +AVGH 8, 4 +AVGH 8, 8 +AVGH 8, 16 +AVGH 16, 8 +AVGH 16, 16 + +// 0 < weight < 64 +.macro load_weights_add_add + vdup.8 d30, ip + vdup.8 d31, r6 +.endm + +.macro load_add_add d1 d2 + vld1.32 {\d1}, [r2], r3 + vld1.32 {\d2}, [r4], r5 +.endm + +.macro weight_add_add dst s1 s2 + vmull.u8 \dst, \s1, d30 + vmlal.u8 \dst, \s2, d31 +.endm + +// weight > 64 +.macro load_weights_add_sub + rsb r6, #0 + vdup.8 d30, ip + vdup.8 d31, r6 +.endm + +.macro load_add_sub d1 d2 + vld1.32 {\d1}, [r2], r3 + vld1.32 {\d2}, [r4], r5 +.endm + +.macro weight_add_sub dst s1 s2 + vmull.u8 \dst, \s1, d30 + vmlsl.u8 \dst, \s2, d31 +.endm + +// weight < 0 +.macro load_weights_sub_add + rsb ip, #0 + vdup.8 d31, r6 + vdup.8 d30, ip +.endm + +.macro load_sub_add d1 d2 + vld1.32 {\d2}, [r4], r5 + vld1.32 {\d1}, [r2], r3 +.endm + +.macro weight_sub_add dst s1 s2 + vmull.u8 \dst, \s2, d31 + vmlsl.u8 \dst, \s1, d30 +.endm + +.macro AVG_WEIGHT ext +function x264_pixel_avg_weight_w4_\ext\()_neon, export=1 + load_weights_\ext +1: // height loop + subs lr, lr, #2 + load_\ext d0[], d1[] + weight_\ext q8, d0, d1 + load_\ext d2[], d3[] + vqrshrun.s16 d0, q8, #6 + weight_\ext q9, d2, d3 + vst1.32 {d0[0]}, [r0,:32], r1 + vqrshrun.s16 d1, q9, #6 + vst1.32 {d1[0]}, [r0,:32], r1 + bgt 1b + pop {r4-r6,pc} +.endfunc + +function x264_pixel_avg_weight_w8_\ext\()_neon, export=1 + load_weights_\ext +1: // height loop + subs lr, lr, #4 + load_\ext d0, d1 + weight_\ext q8, d0, d1 + load_\ext d2, d3 + weight_\ext q9, d2, d3 + load_\ext d4, d5 + weight_\ext q10, d4, d5 + load_\ext d6, d7 + weight_\ext q11, d6, d7 + vqrshrun.s16 d0, q8, #6 + vqrshrun.s16 d1, q9, #6 + vqrshrun.s16 d2, q10, #6 + vqrshrun.s16 d3, q11, #6 + vst1.64 {d0}, [r0,:64], r1 + vst1.64 {d1}, [r0,:64], r1 + vst1.64 {d2}, [r0,:64], r1 + vst1.64 {d3}, 
[r0,:64], r1 + bgt 1b + pop {r4-r6,pc} +.endfunc + +function x264_pixel_avg_weight_w16_\ext\()_neon, export=1 + load_weights_\ext +1: // height loop + subs lr, lr, #2 + load_\ext d0-d1, d2-d3 + weight_\ext q8, d0, d2 + weight_\ext q9, d1, d3 + load_\ext d4-d5, d6-d7 + weight_\ext q10, d4, d6 + weight_\ext q11, d5, d7 + vqrshrun.s16 d0, q8, #6 + vqrshrun.s16 d1, q9, #6 + vqrshrun.s16 d2, q10, #6 + vqrshrun.s16 d3, q11, #6 + vst1.64 {d0-d1}, [r0,:128], r1 + vst1.64 {d2-d3}, [r0,:128], r1 + bgt 1b + pop {r4-r6,pc} +.endfunc +.endm + +AVG_WEIGHT add_add +AVG_WEIGHT add_sub +AVG_WEIGHT sub_add + +function x264_pixel_avg_w4_neon, export=1 + subs lr, lr, #2 + vld1.32 {d0[]}, [r2], r3 + vld1.32 {d2[]}, [r4], r5 + vrhadd.u8 d0, d0, d2 + vld1.32 {d1[]}, [r2], r3 + vld1.32 {d3[]}, [r4], r5 + vrhadd.u8 d1, d1, d3 + vst1.32 {d0[0]}, [r0,:32], r1 + vst1.32 {d1[0]}, [r0,:32], r1 + bgt x264_pixel_avg_w4_neon + pop {r4-r6,pc} +.endfunc + +function x264_pixel_avg_w8_neon, export=1 + subs lr, lr, #4 + vld1.64 {d0}, [r2], r3 + vld1.64 {d2}, [r4], r5 + vrhadd.u8 d0, d0, d2 + vld1.64 {d1}, [r2], r3 + vld1.64 {d3}, [r4], r5 + vrhadd.u8 d1, d1, d3 + vst1.64 {d0}, [r0,:64], r1 + vld1.64 {d2}, [r2], r3 + vld1.64 {d4}, [r4], r5 + vrhadd.u8 d2, d2, d4 + vst1.64 {d1}, [r0,:64], r1 + vld1.64 {d3}, [r2], r3 + vld1.64 {d5}, [r4], r5 + vrhadd.u8 d3, d3, d5 + vst1.64 {d2}, [r0,:64], r1 + vst1.64 {d3}, [r0,:64], r1 + bgt x264_pixel_avg_w8_neon + pop {r4-r6,pc} +.endfunc + +function x264_pixel_avg_w16_neon, export=1 + subs lr, lr, #4 + vld1.64 {d0-d1}, [r2], r3 + vld1.64 {d2-d3}, [r4], r5 + vrhadd.u8 q0, q0, q1 + vld1.64 {d2-d3}, [r2], r3 + vld1.64 {d4-d5}, [r4], r5 + vrhadd.u8 q1, q1, q2 + vst1.64 {d0-d1}, [r0,:128], r1 + vld1.64 {d4-d5}, [r2], r3 + vld1.64 {d6-d7}, [r4], r5 + vrhadd.u8 q2, q2, q3 + vst1.64 {d2-d3}, [r0,:128], r1 + vld1.64 {d6-d7}, [r2], r3 + vld1.64 {d0-d1}, [r4], r5 + vrhadd.u8 q3, q3, q0 + vst1.64 {d4-d5}, [r0,:128], r1 + vst1.64 {d6-d7}, [r0,:128], r1 + bgt x264_pixel_avg_w16_neon + pop {r4-r6,pc} +.endfunc + + +function x264_pixel_avg2_w4_neon, export=1 + ldr ip, [sp, #4] + push {lr} + ldr lr, [sp, #4] +avg2_w4_loop: + subs ip, ip, #2 + vld1.32 {d0[]}, [r2], r3 + vld1.32 {d2[]}, [lr], r3 + vrhadd.u8 d0, d0, d2 + vld1.32 {d1[]}, [r2], r3 + vld1.32 {d3[]}, [lr], r3 + vrhadd.u8 d1, d1, d3 + vst1.32 {d0[0]}, [r0,:32], r1 + vst1.32 {d1[0]}, [r0,:32], r1 + bgt avg2_w4_loop + pop {pc} +.endfunc + +function x264_pixel_avg2_w8_neon, export=1 + ldr ip, [sp, #4] + push {lr} + ldr lr, [sp, #4] +avg2_w8_loop: + subs ip, ip, #2 + vld1.64 {d0}, [r2], r3 + vld1.64 {d2}, [lr], r3 + vrhadd.u8 d0, d0, d2 + vld1.64 {d1}, [r2], r3 + vld1.64 {d3}, [lr], r3 + vrhadd.u8 d1, d1, d3 + vst1.64 {d0}, [r0,:64], r1 + vst1.64 {d1}, [r0,:64], r1 + bgt avg2_w8_loop + pop {pc} +.endfunc + +function x264_pixel_avg2_w16_neon, export=1 + ldr ip, [sp, #4] + push {lr} + ldr lr, [sp, #4] +avg2_w16_loop: + subs ip, ip, #2 + vld1.64 {d0-d1}, [r2], r3 + vld1.64 {d2-d3}, [lr], r3 + vrhadd.u8 q0, q0, q1 + vld1.64 {d4-d5}, [r2], r3 + vld1.64 {d6-d7}, [lr], r3 + vrhadd.u8 q2, q2, q3 + vst1.64 {d0-d1}, [r0,:128], r1 + vst1.64 {d4-d5}, [r0,:128], r1 + bgt avg2_w16_loop + pop {pc} +.endfunc + +function x264_pixel_avg2_w20_neon, export=1 + ldr ip, [sp, #4] + push {lr} + sub r1, r1, #16 + ldr lr, [sp, #4] +avg2_w20_loop: + subs ip, ip, #2 + vld1.64 {d0-d2}, [r2], r3 + vld1.64 {d4-d6}, [lr], r3 + vrhadd.u8 q0, q0, q2 + vrhadd.u8 d2, d2, d6 + vld1.64 {d4-d6}, [r2], r3 + vld1.64 {d16-d18},[lr], r3 + vrhadd.u8 q2, q2, q8 + vst1.64 {d0-d1}, [r0,:128]! 
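+// width 20 is stored as a 16-byte vector plus a 4-byte tail per row; r1 was
+// reduced by 16 above so the post-incremented stores advance one dst row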
+ vrhadd.u8 d6, d6, d18 + vst1.32 {d2[0]}, [r0,:32], r1 + vst1.64 {d4-d5}, [r0,:128]! + vst1.32 {d6[0]}, [r0,:32], r1 + bgt avg2_w20_loop + pop {pc} +.endfunc + + +// void mc_copy( uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int height ) +function x264_mc_copy_w4_neon, export=1 + ldr ip, [sp] +copy_w4_loop: + subs ip, ip, #4 + vld1.32 {d0[]}, [r2], r3 + vld1.32 {d1[]}, [r2], r3 + vld1.32 {d2[]}, [r2], r3 + vld1.32 {d3[]}, [r2], r3 + vst1.32 {d0[0]}, [r0,:32], r1 + vst1.32 {d1[0]}, [r0,:32], r1 + vst1.32 {d2[0]}, [r0,:32], r1 + vst1.32 {d3[0]}, [r0,:32], r1 + bgt copy_w4_loop + bx lr +.endfunc + +function x264_mc_copy_w8_neon, export=1 + ldr ip, [sp] +copy_w8_loop: + subs ip, ip, #4 + vld1.32 {d0}, [r2], r3 + vld1.32 {d1}, [r2], r3 + vld1.32 {d2}, [r2], r3 + vld1.32 {d3}, [r2], r3 + vst1.32 {d0}, [r0,:64], r1 + vst1.32 {d1}, [r0,:64], r1 + vst1.32 {d2}, [r0,:64], r1 + vst1.32 {d3}, [r0,:64], r1 + bgt copy_w8_loop + bx lr +.endfunc + +function x264_mc_copy_w16_neon, export=1 + ldr ip, [sp] +copy_w16_loop: + subs ip, ip, #4 + vld1.32 {d0-d1}, [r2], r3 + vld1.32 {d2-d3}, [r2], r3 + vld1.32 {d4-d5}, [r2], r3 + vld1.32 {d6-d7}, [r2], r3 + vst1.32 {d0-d1}, [r0,:128], r1 + vst1.32 {d2-d3}, [r0,:128], r1 + vst1.32 {d4-d5}, [r0,:128], r1 + vst1.32 {d6-d7}, [r0,:128], r1 + bgt copy_w16_loop + bx lr +.endfunc + +function x264_mc_copy_w16_aligned_neon, export=1 + ldr ip, [sp] +copy_w16_aligned_loop: + subs ip, ip, #4 + vld1.32 {d0-d1}, [r2,:128], r3 + vld1.32 {d2-d3}, [r2,:128], r3 + vld1.32 {d4-d5}, [r2,:128], r3 + vld1.32 {d6-d7}, [r2,:128], r3 + vst1.32 {d0-d1}, [r0,:128], r1 + vst1.32 {d2-d3}, [r0,:128], r1 + vst1.32 {d4-d5}, [r0,:128], r1 + vst1.32 {d6-d7}, [r0,:128], r1 + bgt copy_w16_aligned_loop + bx lr +.endfunc + + +// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride, +// uint8_t *src, int i_src_stride, +// int dx, int dy, int i_width, int i_height ); +function x264_mc_chroma_neon, export=1 + push {r4-r6, lr} + ldrd r4, [sp, #16] + ldr r6, [sp, #24] + + asr lr, r5, #3 + mul lr, r3, lr + add r2, r2, r4, asr #3 + cmp r6, #4 + add r2, r2, lr + + and r4, r4, #7 + and r5, r5, #7 + pld [r2] + pld [r2, r3] + + bgt mc_chroma_w8 + beq mc_chroma_w4 + +// calculate cA cB cC cD +.macro CHROMA_MC_START r0 r1 + muls lr, r4, r5 + rsb r6, lr, r5, lsl #3 + rsb ip, lr, r4, lsl #3 + sub r4, lr, r4, lsl #3 + sub r4, r4, r5, lsl #3 + add r4, r4, #64 + + beq 2f + + add r5, r2, r3 + + vdup.8 d0, r4 + lsl r3, r3, #1 + vdup.8 d1, ip + vld1.64 {\r0}, [r2], r3 + vdup.8 d2, r6 + vld1.64 {\r1}, [r5], r3 + vdup.8 d3, lr + ldr r4, [sp, #28] + + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 +.endm + +.macro CHROMA_MC width, align +mc_chroma_w\width: + CHROMA_MC_START d4, d6 +// since the element size varies, there's a different index for the 2nd store +.if \width == 4 + .set st2, 1 +.else + .set st2, 2 +.endif + + vtrn.32 d4, d5 + vtrn.32 d6, d7 + + vtrn.32 d0, d1 + vtrn.32 d2, d3 + +1: // height loop, interpolate xy + pld [r5] + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d6, d2 + vld1.64 {d4}, [r2], r3 + vext.8 d5, d4, d5, #1 + vtrn.32 d4, d5 + vmull.u8 q9, d6, d0 + vmlal.u8 q9, d4, d2 + vld1.64 {d6}, [r5], r3 + vadd.i16 d16, d16, d17 + vadd.i16 d17, d18, d19 + vrshrn.u16 d16, q8, #6 + subs r4, r4, #2 + pld [r2] + vext.8 d7, d6, d7, #1 + vtrn.32 d6, d7 + vst1.\align {d16[0]}, [r0,:\align], r1 + vst1.\align {d16[st2]}, [r0,:\align], r1 + bgt 1b + + pop {r4-r6, pc} + +2: // dx or dy are 0 + tst r6, r6 + add ip, ip, r6 + vdup.8 d0, r4 + vdup.8 d1, ip + vtrn.32 d0, d1 + ldr r4, [sp, #28] + + beq 4f + + vext.32 
d1, d0, d1, #1 + add r5, r2, r3 + lsl r3, r3, #1 + vld1.32 {d4[0]}, [r2], r3 + vld1.32 {d4[1]}, [r5], r3 + +3: // vertical interpolation loop + pld [r5] + vmull.u8 q8, d4, d0 + vld1.32 {d4[0]}, [r2], r3 + vmull.u8 q9, d4, d1 + vld1.32 {d4[1]}, [r5], r3 + vadd.i16 d16, d16, d17 + vadd.i16 d17, d18, d19 + vrshrn.u16 d16, q8, #6 + subs r4, r4, #2 + pld [r2] + vst1.\align {d16[0]}, [r0,:\align], r1 + vst1.\align {d16[st2]}, [r0,:\align], r1 + bgt 3b + + pop {r4-r6, pc} + +4: // dy is 0 + vld1.64 {d4}, [r2], r3 + vld1.64 {d6}, [r2], r3 + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vtrn.32 d4, d5 + vtrn.32 d6, d7 + +5: // horizontal interpolation loop + vmull.u8 q8, d4, d0 + vmull.u8 q9, d6, d0 + subs r4, r4, #2 + vld1.64 {d4}, [r2], r3 + vext.8 d5, d4, d5, #1 + vtrn.32 d4, d5 + vadd.i16 d16, d16, d17 + vadd.i16 d17, d18, d19 + pld [r2] + vrshrn.u16 d16, q8, #6 + vld1.64 {d6}, [r2], r3 + vext.8 d7, d6, d7, #1 + vtrn.32 d6, d7 + pld [r2] + vst1.\align {d16[0]}, [r0,:\align], r1 + vst1.\align {d16[st2]}, [r0,:\align], r1 + bgt 5b + + pop {r4-r6, pc} +.endm + + CHROMA_MC 2, 16 + CHROMA_MC 4, 32 + +// the optimial timing for width 8 is different enough that it's not +// readable to put it in the same macro as width 2/4 +mc_chroma_w8: + CHROMA_MC_START d4-d5, d6-d7 + +1: // height loop, interpolate xy + pld [r5] + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d5, d1 + vld1.64 {d4, d5}, [r2], r3 + vmlal.u8 q8, d6, d2 + vext.8 d5, d4, d5, #1 + vmlal.u8 q8, d7, d3 + vmull.u8 q9, d6, d0 + subs r4, r4, #2 + vmlal.u8 q9, d7, d1 + vmlal.u8 q9, d4, d2 + vmlal.u8 q9, d5, d3 + vrshrn.u16 d16, q8, #6 + vld1.64 {d6, d7}, [r5], r3 + pld [r2] + vrshrn.u16 d17, q9, #6 + vext.8 d7, d6, d7, #1 + vst1.64 {d16}, [r0,:64], r1 + vst1.64 {d17}, [r0,:64], r1 + bgt 1b + + pop {r4-r6, pc} + +2: // dx or dy are 0 + tst r6, r6 + add ip, ip, r6 + vdup.8 d0, r4 + vdup.8 d1, ip + ldr r4, [sp, #28] + + beq 4f + + add r5, r2, r3 + lsl r3, r3, #1 + vld1.64 {d4}, [r2], r3 + vld1.64 {d6}, [r5], r3 + +3: // vertical interpolation loop + pld [r5] + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d6, d1 + vld1.64 {d4}, [r2], r3 + vmull.u8 q9, d6, d0 + vmlal.u8 q9, d4, d1 + vld1.64 {d6}, [r5], r3 + vrshrn.u16 d16, q8, #6 + vrshrn.u16 d17, q9, #6 + subs r4, r4, #2 + pld [r2] + vst1.64 {d16}, [r0,:64], r1 + vst1.64 {d17}, [r0,:64], r1 + bgt 3b + + pop {r4-r6, pc} + +4: // dy is 0 + vld1.64 {d4, d5}, [r2], r3 + vld1.64 {d6, d7}, [r2], r3 + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + +5: // horizontal interpolation loop + pld [r2] + subs r4, r4, #2 + vmull.u8 q8, d4, d0 + vmlal.u8 q8, d5, d1 + vld1.64 {d4, d5}, [r2], r3 + vmull.u8 q9, d6, d0 + vmlal.u8 q9, d7, d1 + pld [r2] + vext.8 d5, d4, d5, #1 + vrshrn.u16 d16, q8, #6 + vrshrn.u16 d17, q9, #6 + vld1.64 {d6, d7}, [r2], r3 + vext.8 d7, d6, d7, #1 + vst1.64 {d16}, [r0,:64], r1 + vst1.64 {d17}, [r0,:64], r1 + bgt 5b + + pop {r4-r6, pc} +.endfunc + + +// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width) +function x264_hpel_filter_v_neon, export=1 + ldr ip, [sp] + sub r1, r1, r3, lsl #1 + push {lr} + add lr, r1, ip + vmov.u8 d30, #5 + vmov.u8 d31, #20 + +filter_v_loop: + subs ip, ip, #16 + vld1.64 {d0-d1}, [r1,:128], r3 + vld1.64 {d2-d3}, [r1,:128], r3 + vld1.64 {d4-d5}, [r1,:128], r3 + vld1.64 {d6-d7}, [r1,:128], r3 + vld1.64 {d16-d17}, [r1,:128], r3 + vld1.64 {d18-d19}, [r1,:128], r3 + sub r1, lr, ip + + vaddl.u8 q10, d0, d18 + vmlsl.u8 q10, d2, d30 + vmlal.u8 q10, d4, d31 + vmlal.u8 q10, d6, d31 + vmlsl.u8 q10, d16, d30 + + vaddl.u8 q11, d1, d19 + vmlsl.u8 q11, d3, d30 + 
vmlal.u8 q11, d5, d31 + vmlal.u8 q11, d7, d31 + vmlsl.u8 q11, d17, d30 + + vqrshrun.s16 d0, q10, #5 + vst1.64 {d20-d21}, [r2,:128]! + vqrshrun.s16 d1, q11, #5 + vst1.64 {d22-d23}, [r2,:128]! + vst1.64 {d0-d1}, [r0,:128]! + bgt filter_v_loop + pop {pc} +.endfunc + +// hpel_filter_c( uint8_t *dst, int16_t *buf, int width ); +function x264_hpel_filter_c_neon, export=1 + sub r1, #16 + vld1.64 {d0-d3}, [r1,:128]! + + // unrolled 2x: 4% faster +filter_c_loop: + subs r2, r2, #16 + vld1.64 {d4-d7}, [r1,:128]! + vext.16 q8, q0, q1, #6 + vext.16 q12, q1, q2, #3 + vadd.s16 q8, q8, q12 + vext.16 q9, q0, q1, #7 + vext.16 q11, q1, q2, #2 + vadd.s16 q9, q9, q11 + vext.16 q10, q1, q2, #1 + vext.16 q11, q1, q2, #6 + vadd.s16 q10, q1, q10 + vsub.s16 q8, q8, q9 // a-b + vext.16 q15, q2, q3, #3 + vsub.s16 q9, q9, q10 // b-c + + vext.16 q12, q1, q2, #7 + vshr.s16 q8, q8, #2 // (a-b)/4 + vadd.s16 q11, q11, q15 + vext.16 q14, q2, q3, #2 + vsub.s16 q8, q8, q9 // (a-b)/4-b+c + vadd.s16 q12, q12, q14 + vext.16 q13, q2, q3, #1 + + vshr.s16 q8, q8, #2 // ((a-b)/4-b+c)/4 + vadd.s16 q13, q2, q13 + vadd.s16 q8, q8, q10 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vsub.s16 q11, q11, q12 // a-b + vsub.s16 q12, q12, q13 // b-c + vshr.s16 q11, q11, #2 // (a-b)/4 + vqrshrun.s16 d30, q8, #6 + vsub.s16 q11, q11, q12 // (a-b)/4-b+c + vshr.s16 q11, q11, #2 // ((a-b)/4-b+c)/4 + vld1.64 {d0-d3}, [r1,:128]! + vadd.s16 q11, q11, q13 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + + vext.16 q8, q2, q3, #6 + vqrshrun.s16 d31, q11, #6 + vext.16 q12, q3, q0, #3 + vadd.s16 q8, q8, q12 + vext.16 q9, q2, q3, #7 + vst1.64 {d30-d31}, [r0,:128]! + bxle lr + subs r2, r2, #16 + + vext.16 q11, q3, q0, #2 + vadd.s16 q9, q9, q11 + vext.16 q10, q3, q0, #1 + vext.16 q11, q3, q0, #6 + vadd.s16 q10, q3, q10 + vsub.s16 q8, q8, q9 // a-b + vext.16 q15, q0, q1, #3 + vsub.s16 q9, q9, q10 // b-c + + vext.16 q12, q3, q0, #7 + vshr.s16 q8, q8, #2 // (a-b)/4 + vadd.s16 q11, q11, q15 + vext.16 q14, q0, q1, #2 + vsub.s16 q8, q8, q9 // (a-b)/4-b+c + vadd.s16 q12, q12, q14 + vext.16 q13, q0, q1, #1 + + vshr.s16 q8, q8, #2 // ((a-b)/4-b+c)/4 + vadd.s16 q13, q0, q13 + vadd.s16 q8, q8, q10 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + vsub.s16 q11, q11, q12 // a-b + vsub.s16 q12, q12, q13 // b-c + vshr.s16 q11, q11, #2 // (a-b)/4 + vqrshrun.s16 d30, q8, #6 + vsub.s16 q11, q11, q12 // (a-b)/4-b+c + vshr.s16 q11, q11, #2 // ((a-b)/4-b+c)/4 + vadd.s16 q11, q11, q13 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 + + vqrshrun.s16 d31, q11, #6 + vst1.64 {d30-d31}, [r0,:128]! + bgt filter_c_loop + bx lr +.endfunc + +// hpel_filter_h( uint8_t *dst, uint8_t *src, int width ); +function x264_hpel_filter_h_neon, export=1 + sub r1, #16 + vmov.u8 d30, #5 + vld1.64 {d0-d3}, [r1,:128]! + vmov.u8 d31, #20 + + // unrolled 3x because it's 5% faster, due to mitigating + // the high latency of multiplication and vqrshrun +filter_h_loop: + subs r2, r2, #16 + vld1.64 {d4-d5}, [r1,:128]! + vext.8 q8, q0, q1, #14 + vext.8 q12, q1, q2, #3 + vaddl.u8 q13, d16, d24 + vext.8 q9, q0, q1, #15 + vaddl.u8 q14, d17, d25 + + vext.8 q10, q1, q2, #1 + vmlal.u8 q13, d2, d31 + vmlsl.u8 q13, d18, d30 + vext.8 q11, q1, q2, #2 + vmlal.u8 q13, d20, d31 + vmlsl.u8 q13, d22, d30 + + vmlsl.u8 q14, d19, d30 + vmlal.u8 q14, d3, d31 + vmlal.u8 q14, d21, d31 + vmlsl.u8 q14, d23, d30 + vqrshrun.s16 d6, q13, #5 + + vld1.64 {d0-d1}, [r1,:128]! + vext.8 q8, q1, q2, #14 + vext.8 q12, q2, q0, #3 + vaddl.u8 q13, d16, d24 + vqrshrun.s16 d7, q14, #5 + vext.8 q9, q1, q2, #15 + vaddl.u8 q14, d17, d25 + + vst1.64 {d6-d7}, [r0,:128]! 
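+// 16 outputs of the 6-tap (1,-5,20,20,-5,1) filter stored (vqrshrun #5
+// rounds and scales by 1/32); return early once the width is exhausted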
+ bxle lr + subs r2, r2, #16 + + vext.8 q10, q2, q0, #1 + vmlal.u8 q13, d4, d31 + vmlsl.u8 q13, d18, d30 + vext.8 q11, q2, q0, #2 + vmlal.u8 q13, d20, d31 + vmlsl.u8 q13, d22, d30 + + vmlsl.u8 q14, d19, d30 + vmlal.u8 q14, d5, d31 + vmlal.u8 q14, d21, d31 + vmlsl.u8 q14, d23, d30 + vqrshrun.s16 d6, q13, #5 + + vld1.64 {d2-d3}, [r1,:128]! + vext.8 q8, q2, q0, #14 + vext.8 q12, q0, q1, #3 + vaddl.u8 q13, d16, d24 + vqrshrun.s16 d7, q14, #5 + vext.8 q9, q2, q0, #15 + vaddl.u8 q14, d17, d25 + + vst1.64 {d6-d7}, [r0,:128]! + bxle lr + subs r2, r2, #16 + + vext.8 q10, q0, q1, #1 + vmlal.u8 q13, d0, d31 + vmlsl.u8 q13, d18, d30 + vext.8 q11, q0, q1, #2 + vmlal.u8 q13, d20, d31 + vmlsl.u8 q13, d22, d30 + + vmlsl.u8 q14, d19, d30 + vmlal.u8 q14, d1, d31 + vmlal.u8 q14, d21, d31 + vmlsl.u8 q14, d23, d30 + + vqrshrun.s16 d6, q13, #5 + vqrshrun.s16 d7, q14, #5 + vst1.64 {d6-d7}, [r0,:128]! + bgt filter_h_loop + bx lr +.endfunc + + +// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, +// uint8_t *dstc, int src_stride, int dst_stride, int width, +// int height ) +function x264_frame_init_lowres_core_neon, export=1 + push {r4-r10,lr} + vpush {d8-d15} + ldrd r4, [sp, #96] + ldrd r6, [sp, #104] + ldr lr, [sp, #112] + sub r10, r6, r7 // dst_stride - width + and r10, r10, #~15 + +lowres_yloop: + mov ip, r7 // width + mov r6, r0 // src0 + add r8, r0, r5 // src1 = src0 + src_stride + add r9, r0, r5, lsl #1 // src2 = src1 + src_stride + + vld2.8 {d8, d10}, [r6,:128]! + vld2.8 {d12,d14}, [r8,:128]! + vld2.8 {d16,d18}, [r9,:128]! + +lowres_xloop: + subs ip, ip, #16 + + vld2.8 {d9, d11}, [r6,:128]! + vld2.8 {d13,d15}, [r8,:128]! + vrhadd.u8 q0, q4, q6 + vld2.8 {d17,d19}, [r9,:128]! + vrhadd.u8 q5, q5, q7 + vld2.8 {d20,d22}, [r6,:128]! + vrhadd.u8 q1, q6, q8 + vld2.8 {d24,d26}, [r8,:128]! + vrhadd.u8 q7, q7, q9 + vext.8 q4, q4, q10, #1 + vrhadd.u8 q0, q0, q5 + vext.8 q6, q6, q12, #1 + vrhadd.u8 q1, q1, q7 + vld2.8 {d28,d30}, [r9,:128]! + vrhadd.u8 q4, q4, q6 + vext.8 q8, q8, q14, #1 + vrhadd.u8 q6, q6, q8 + vst1.64 {d0-d1}, [r1,:128]! + vrhadd.u8 q2, q4, q5 + vst1.64 {d2-d3}, [r3,:128]! + vrhadd.u8 q3, q6, q7 + vst1.64 {d4-d5}, [r2,:128]! + vst1.64 {d6-d7}, [r4,:128]! + + ble lowres_xloop_end + subs ip, ip, #16 + + vld2.8 {d21,d23}, [r6,:128]! + vld2.8 {d25,d27}, [r8,:128]! + vrhadd.u8 q0, q10, q12 + vld2.8 {d29,d31}, [r9,:128]! + vrhadd.u8 q11, q11, q13 + vld2.8 {d8, d10}, [r6,:128]! + vrhadd.u8 q1, q12, q14 + vld2.8 {d12,d14}, [r8,:128]! + vrhadd.u8 q13, q13, q15 + vext.8 q10, q10, q4, #1 + vrhadd.u8 q0, q0, q11 + vext.8 q12, q12, q6, #1 + vrhadd.u8 q1, q1, q13 + vld2.8 {d16,d18}, [r9,:128]! + vrhadd.u8 q10, q10, q12 + vext.8 q14, q14, q8, #1 + vrhadd.u8 q12, q12, q14 + vst1.64 {d0-d1}, [r1,:128]! + vrhadd.u8 q2, q10, q11 + vst1.64 {d2-d3}, [r3,:128]! + vrhadd.u8 q3, q12, q13 + vst1.64 {d4-d5}, [r2,:128]! + vst1.64 {d6-d7}, [r4,:128]! 
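+// second half of the 2x-unrolled x loop: 16 more pixels of each lowres
+// plane (dst0, dsth, dstv, dstc) have been written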
+ + bgt lowres_xloop + +lowres_xloop_end: + subs lr, lr, #1 + add r0, r0, r5, lsl #1 + add r1, r1, r10 + add r2, r2, r10 + add r3, r3, r10 + add r4, r4, r10 + bgt lowres_yloop + + vpop {d8-d15} + pop {r4-r10,pc} +.endfunc diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c new file mode 100644 index 00000000..201dc6b0 --- /dev/null +++ b/common/arm/mc-c.c @@ -0,0 +1,185 @@ +/***************************************************************************** + * mc-c.c: h264 encoder library (Motion Compensation) + ***************************************************************************** + * Copyright (C) 2009 x264 project + * + * Authors: David Conrad + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + *****************************************************************************/ + +#include "common/common.h" +#include "mc.h" + +void x264_prefetch_ref_arm( uint8_t *, int, int ); +void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int ); + +void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n ); +void x264_memzero_aligned_neon( void *dst, size_t n ); + +void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +void x264_pixel_avg_8x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +void x264_pixel_avg_8x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +void x264_pixel_avg_8x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +void x264_pixel_avg_4x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +void x264_pixel_avg_4x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); +void x264_pixel_avg_4x2_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ); + +void x264_pixel_avg2_w4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int ); +void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int ); +void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int ); +void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int ); + +void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int ); +void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int ); +void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int ); +void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int ); + +void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int ); +void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int); + +static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) = +{ + NULL, + x264_pixel_avg2_w4_neon, + x264_pixel_avg2_w8_neon, + x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function + x264_pixel_avg2_w16_neon, 
+ x264_pixel_avg2_w20_neon, +}; + +static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int, int ) = +{ + NULL, + x264_mc_copy_w4_neon, + x264_mc_copy_w8_neon, + NULL, + x264_mc_copy_w16_neon, +}; + +static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; +static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; + +static void mc_luma_neon( uint8_t *dst, int i_dst_stride, + uint8_t *src[4], int i_src_stride, + int mvx, int mvy, + int i_width, int i_height ) +{ + int qpel_idx = ((mvy&3)<<2) + (mvx&3); + int offset = (mvy>>2)*i_src_stride + (mvx>>2); + uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + if ( (mvy&3) == 3 ) // explict if() to force conditional add + src1 += i_src_stride; + + if( qpel_idx & 5 ) /* qpel interpolation needed */ + { + uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + x264_pixel_avg_wtab_neon[i_width>>2]( + dst, i_dst_stride, src1, i_src_stride, + src2, i_height ); + } + else + { + x264_mc_copy_wtab_neon[i_width>>2]( + dst, i_dst_stride, src1, i_src_stride, i_height ); + } +} + +static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride, + uint8_t *src[4], int i_src_stride, + int mvx, int mvy, + int i_width, int i_height ) +{ + int qpel_idx = ((mvy&3)<<2) + (mvx&3); + int offset = (mvy>>2)*i_src_stride + (mvx>>2); + uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + if ( (mvy&3) == 3 ) // explict if() to force conditional add + src1 += i_src_stride; + + if( qpel_idx & 5 ) /* qpel interpolation needed */ + { + uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + x264_pixel_avg_wtab_neon[i_width>>2]( + dst, *i_dst_stride, src1, i_src_stride, + src2, i_height ); + return dst; + } + else + { + *i_dst_stride = i_src_stride; + return src1; + } +} + +void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int ); +void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int ); +void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int ); + +static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, + int stride, int width, int height, int16_t *buf ) +{ + int realign = (intptr_t)src & 15; + src -= realign; + dstv -= realign; + dstc -= realign; + dsth -= realign; + width += realign; + while( height-- ) + { + x264_hpel_filter_v_neon( dstv, src, buf+8, stride, width ); + x264_hpel_filter_c_neon( dstc, buf+8, width ); + x264_hpel_filter_h_neon( dsth, src, width ); + dsth += stride; + dstv += stride; + dstc += stride; + src += stride; + } +} + +void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf ) +{ + if( !(cpu&X264_CPU_ARMV6) ) + return; + + pf->prefetch_fenc = x264_prefetch_fenc_arm; + pf->prefetch_ref = x264_prefetch_ref_arm; + + if( !(cpu&X264_CPU_NEON) ) + return; + + pf->copy_16x16_unaligned = x264_mc_copy_w16_neon; + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon; + pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; + pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon; + + pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon; + pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon; + pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon; + pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon; + pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon; + pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon; + pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon; + + pf->memcpy_aligned = x264_memcpy_aligned_neon; + pf->memzero_aligned = x264_memzero_aligned_neon; + + pf->mc_chroma = x264_mc_chroma_neon; + pf->mc_luma = mc_luma_neon; + pf->get_ref = 
get_ref_neon; + pf->hpel_filter = hpel_filter_neon; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon; +} diff --git a/common/arm/mc.h b/common/arm/mc.h new file mode 100644 index 00000000..6ee510ef --- /dev/null +++ b/common/arm/mc.h @@ -0,0 +1,28 @@ +/***************************************************************************** + * mc.h: h264 encoder library (Motion Compensation) + ***************************************************************************** + * Copyright (C) 2009 x264 project + * + * Authors: David Conrad + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + *****************************************************************************/ + +#ifndef X264_ARM_MC_H +#define X264_ARM_MC_H + +void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf ); + +#endif diff --git a/common/mc.c b/common/mc.c index ee769a01..41cf5938 100644 --- a/common/mc.c +++ b/common/mc.c @@ -29,6 +29,9 @@ #ifdef ARCH_PPC #include "ppc/mc.h" #endif +#ifdef ARCH_ARM +#include "arm/mc.h" +#endif static inline void pixel_avg( uint8_t *dst, int i_dst_stride, @@ -428,6 +431,9 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf ) if( cpu&X264_CPU_ALTIVEC ) x264_mc_altivec_init( pf ); #endif +#ifdef HAVE_ARMV6 + x264_mc_init_arm( cpu, pf ); +#endif } void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )