--- /dev/null
+/*****************************************************************************
+ * mc.S: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ * Mans Rullgard <mans@mansr.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.fpu neon
+.text
+
+// note: the prefetch functions assume a 64-byte cacheline, which holds for the Cortex-A8.
+// They also use nothing above armv5te, but we don't care about pre-armv6 anyway.
+
+// void prefetch_ref( uint8_t *pix, int stride, int parity )
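+// roughly: pld 8 consecutive rows of the reference, 64 bytes into each row,
+// starting 8 rows further down when parity == 0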
+function x264_prefetch_ref_arm, export=1
+ sub r2, r2, #1
+ add r0, r0, #64
+ and r2, r2, r1
+ add r0, r0, r2, lsl #3
+ add r2, r1, r1, lsl #1
+ pld [r0]
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+ add r3, r0, r1, lsl #2
+ pld [r0, r2]
+ pld [r3]
+ pld [r3, r1]
+ pld [r3, r1, lsl #1]
+ pld [r3, r2]
+ bx lr
+.endfunc
+
+// void prefetch_fenc( uint8_t *pix_y, int stride_y,
+// uint8_t *pix_uv, int stride_uv, int mb_x )
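+// roughly: pld 4 luma rows at pix_y + 64 + (mb_x&3)*4*stride_y and
+// 4 chroma rows at pix_uv + 64 + (mb_x&6)*4*stride_uv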
+function x264_prefetch_fenc_arm, export=1
+ ldr ip, [sp]
+ push {lr}
+ and lr, ip, #3
+ smulbb lr, lr, r1 // note: this assumes stride_y fits in 16 bits (signed)
+ and ip, ip, #6
+ smulbb ip, ip, r3
+ add r0, r0, #64
+ add r2, r2, #64
+ add r0, r0, lr, lsl #2
+ pld [r0]
+ add lr, r0, r1, lsl #1
+ pld [r0, r1]
+ pld [lr]
+ add r2, r2, ip, lsl #2
+ pld [lr, r1]
+ pld [r2]
+ add ip, r2, r3, lsl #1
+ pld [r2, r3]
+ pld [ip]
+ pld [ip, r3]
+ pop {pc}
+.endfunc
+
+
+// void *x264_memcpy_aligned( void * dst, const void * src, size_t n )
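+// jump on the 8/16-byte alignment of dst and src: offsets 0/4/8/12 into the
+// table below select the dst16/src16, dst16/src8, dst8/src16, dst8/src8 copies
+// (stray address bits can only pick a more conservative variant)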
+function x264_memcpy_aligned_neon, export=1
+ orr r3, r0, r1, lsr #1
+ movrel ip, memcpy_table
+ and r3, r3, #0xc
+ ldr pc, [ip, r3]
+.endfunc
+
+.macro MEMCPY_ALIGNED srcalign dstalign
+function memcpy_aligned_\dstalign\()_\srcalign\()_neon
+ mov r3, r0
+.if \srcalign == 8 && \dstalign == 8
+ sub r2, #16
+ vld1.64 {d0}, [r1,:64]!
+ vst1.64 {d0}, [r3,:64]!
+ .set r1align, 128
+ .set r3align, 128
+.else
+ .set r1align, \srcalign * 8
+ .set r3align, \dstalign * 8
+.endif
+ tst r2, #16
+ beq 32f
+ sub r2, #16
+ vld1.64 {d0-d1}, [r1,:r1align]!
+ vst1.64 {d0-d1}, [r3,:r3align]!
+32: // n is a multiple of 32
+ tst r2, #32
+ beq 64f
+ sub r2, #32
+ vld1.64 {d0-d3}, [r1,:r1align]!
+ vst1.64 {d0-d3}, [r3,:r3align]!
+64: // n is a multiple of 64
+ subs r2, #64
+ vld1.64 {d0-d3}, [r1,:r1align]!
+ vld1.64 {d4-d7}, [r1,:r1align]!
+ vst1.64 {d0-d3}, [r3,:r3align]!
+ vst1.64 {d4-d7}, [r3,:r3align]!
+ bgt 64b
+.if \srcalign == 8 && \dstalign == 8
+ vld1.64 {d0}, [r1,:64]!
+ vst1.64 {d0}, [r3,:64]!
+.endif
+ bx lr
+.endfunc
+.endm
+
+MEMCPY_ALIGNED 16, 16
+MEMCPY_ALIGNED 16, 8
+MEMCPY_ALIGNED 8, 16
+MEMCPY_ALIGNED 8, 8
+
+.section .rodata
+memcpy_table:
+.word memcpy_aligned_16_16_neon
+.word memcpy_aligned_16_8_neon
+.word memcpy_aligned_8_16_neon
+.word memcpy_aligned_8_8_neon
+.text
+
+
+// void x264_memzero_aligned( void *dst, size_t n )
+function x264_memzero_aligned_neon, export=1
+ vmov.i8 q0, #0
+ vmov.i8 q1, #0
+memzero_loop:
+ subs r1, #128
+.rept 4
+ vst1.64 {d0-d3}, [r0,:128]!
+.endr
+ bgt memzero_loop
+ bx lr
+.endfunc
+
+
+// void pixel_avg( uint8_t *dst, int dst_stride,
+// uint8_t *src1, int src1_stride,
+// uint8_t *src2, int src2_stride, int weight );
+.macro AVGH w h
+function x264_pixel_avg_\w\()x\h\()_neon, export=1
+ ldr ip, [sp, #8]
+ push {r4-r6,lr}
+ cmp ip, #32
+ ldrd r4, [sp, #16]
+ mov lr, #\h
+ beq x264_pixel_avg_w\w\()_neon
+ rsbs r6, ip, #64
+ blt x264_pixel_avg_weight_w\w\()_add_sub_neon // weight > 64
+ cmp ip, #0
+ bge x264_pixel_avg_weight_w\w\()_add_add_neon
+ b x264_pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
+.endfunc
+.endm
+
+AVGH 4, 2
+AVGH 4, 4
+AVGH 4, 8
+AVGH 8, 4
+AVGH 8, 8
+AVGH 8, 16
+AVGH 16, 8
+AVGH 16, 16
+
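+// The weighted cases compute dst = clip( (src1*weight + src2*(64-weight) + 32) >> 6 ).
+// weight == 32 branches to the plain rounded average (x264_pixel_avg_wN_neon); since
+// vmull/vmlal only take unsigned bytes, the sign combinations of the two coefficients
+// are split into the three variants below.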
+// 0 < weight < 64
+.macro load_weights_add_add
+ vdup.8 d30, ip
+ vdup.8 d31, r6
+.endm
+
+.macro load_add_add d1 d2
+ vld1.32 {\d1}, [r2], r3
+ vld1.32 {\d2}, [r4], r5
+.endm
+
+.macro weight_add_add dst s1 s2
+ vmull.u8 \dst, \s1, d30
+ vmlal.u8 \dst, \s2, d31
+.endm
+
+// weight > 64
+.macro load_weights_add_sub
+ rsb r6, #0
+ vdup.8 d30, ip
+ vdup.8 d31, r6
+.endm
+
+.macro load_add_sub d1 d2
+ vld1.32 {\d1}, [r2], r3
+ vld1.32 {\d2}, [r4], r5
+.endm
+
+.macro weight_add_sub dst s1 s2
+ vmull.u8 \dst, \s1, d30
+ vmlsl.u8 \dst, \s2, d31
+.endm
+
+// weight < 0
+.macro load_weights_sub_add
+ rsb ip, #0
+ vdup.8 d31, r6
+ vdup.8 d30, ip
+.endm
+
+.macro load_sub_add d1 d2
+ vld1.32 {\d2}, [r4], r5
+ vld1.32 {\d1}, [r2], r3
+.endm
+
+.macro weight_sub_add dst s1 s2
+ vmull.u8 \dst, \s2, d31
+ vmlsl.u8 \dst, \s1, d30
+.endm
+
+.macro AVG_WEIGHT ext
+function x264_pixel_avg_weight_w4_\ext\()_neon, export=1
+ load_weights_\ext
+1: // height loop
+ subs lr, lr, #2
+ load_\ext d0[], d1[]
+ weight_\ext q8, d0, d1
+ load_\ext d2[], d3[]
+ vqrshrun.s16 d0, q8, #6
+ weight_\ext q9, d2, d3
+ vst1.32 {d0[0]}, [r0,:32], r1
+ vqrshrun.s16 d1, q9, #6
+ vst1.32 {d1[0]}, [r0,:32], r1
+ bgt 1b
+ pop {r4-r6,pc}
+.endfunc
+
+function x264_pixel_avg_weight_w8_\ext\()_neon, export=1
+ load_weights_\ext
+1: // height loop
+ subs lr, lr, #4
+ load_\ext d0, d1
+ weight_\ext q8, d0, d1
+ load_\ext d2, d3
+ weight_\ext q9, d2, d3
+ load_\ext d4, d5
+ weight_\ext q10, d4, d5
+ load_\ext d6, d7
+ weight_\ext q11, d6, d7
+ vqrshrun.s16 d0, q8, #6
+ vqrshrun.s16 d1, q9, #6
+ vqrshrun.s16 d2, q10, #6
+ vqrshrun.s16 d3, q11, #6
+ vst1.64 {d0}, [r0,:64], r1
+ vst1.64 {d1}, [r0,:64], r1
+ vst1.64 {d2}, [r0,:64], r1
+ vst1.64 {d3}, [r0,:64], r1
+ bgt 1b
+ pop {r4-r6,pc}
+.endfunc
+
+function x264_pixel_avg_weight_w16_\ext\()_neon, export=1
+ load_weights_\ext
+1: // height loop
+ subs lr, lr, #2
+ load_\ext d0-d1, d2-d3
+ weight_\ext q8, d0, d2
+ weight_\ext q9, d1, d3
+ load_\ext d4-d5, d6-d7
+ weight_\ext q10, d4, d6
+ weight_\ext q11, d5, d7
+ vqrshrun.s16 d0, q8, #6
+ vqrshrun.s16 d1, q9, #6
+ vqrshrun.s16 d2, q10, #6
+ vqrshrun.s16 d3, q11, #6
+ vst1.64 {d0-d1}, [r0,:128], r1
+ vst1.64 {d2-d3}, [r0,:128], r1
+ bgt 1b
+ pop {r4-r6,pc}
+.endfunc
+.endm
+
+AVG_WEIGHT add_add
+AVG_WEIGHT add_sub
+AVG_WEIGHT sub_add
+
+function x264_pixel_avg_w4_neon, export=1
+ subs lr, lr, #2
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d2[]}, [r4], r5
+ vrhadd.u8 d0, d0, d2
+ vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d3[]}, [r4], r5
+ vrhadd.u8 d1, d1, d3
+ vst1.32 {d0[0]}, [r0,:32], r1
+ vst1.32 {d1[0]}, [r0,:32], r1
+ bgt x264_pixel_avg_w4_neon
+ pop {r4-r6,pc}
+.endfunc
+
+function x264_pixel_avg_w8_neon, export=1
+ subs lr, lr, #4
+ vld1.64 {d0}, [r2], r3
+ vld1.64 {d2}, [r4], r5
+ vrhadd.u8 d0, d0, d2
+ vld1.64 {d1}, [r2], r3
+ vld1.64 {d3}, [r4], r5
+ vrhadd.u8 d1, d1, d3
+ vst1.64 {d0}, [r0,:64], r1
+ vld1.64 {d2}, [r2], r3
+ vld1.64 {d4}, [r4], r5
+ vrhadd.u8 d2, d2, d4
+ vst1.64 {d1}, [r0,:64], r1
+ vld1.64 {d3}, [r2], r3
+ vld1.64 {d5}, [r4], r5
+ vrhadd.u8 d3, d3, d5
+ vst1.64 {d2}, [r0,:64], r1
+ vst1.64 {d3}, [r0,:64], r1
+ bgt x264_pixel_avg_w8_neon
+ pop {r4-r6,pc}
+.endfunc
+
+function x264_pixel_avg_w16_neon, export=1
+ subs lr, lr, #4
+ vld1.64 {d0-d1}, [r2], r3
+ vld1.64 {d2-d3}, [r4], r5
+ vrhadd.u8 q0, q0, q1
+ vld1.64 {d2-d3}, [r2], r3
+ vld1.64 {d4-d5}, [r4], r5
+ vrhadd.u8 q1, q1, q2
+ vst1.64 {d0-d1}, [r0,:128], r1
+ vld1.64 {d4-d5}, [r2], r3
+ vld1.64 {d6-d7}, [r4], r5
+ vrhadd.u8 q2, q2, q3
+ vst1.64 {d2-d3}, [r0,:128], r1
+ vld1.64 {d6-d7}, [r2], r3
+ vld1.64 {d0-d1}, [r4], r5
+ vrhadd.u8 q3, q3, q0
+ vst1.64 {d4-d5}, [r0,:128], r1
+ vst1.64 {d6-d7}, [r0,:128], r1
+ bgt x264_pixel_avg_w16_neon
+ pop {r4-r6,pc}
+.endfunc
+
+
+function x264_pixel_avg2_w4_neon, export=1
+ ldr ip, [sp, #4]
+ push {lr}
+ ldr lr, [sp, #4]
+avg2_w4_loop:
+ subs ip, ip, #2
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d2[]}, [lr], r3
+ vrhadd.u8 d0, d0, d2
+ vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d3[]}, [lr], r3
+ vrhadd.u8 d1, d1, d3
+ vst1.32 {d0[0]}, [r0,:32], r1
+ vst1.32 {d1[0]}, [r0,:32], r1
+ bgt avg2_w4_loop
+ pop {pc}
+.endfunc
+
+function x264_pixel_avg2_w8_neon, export=1
+ ldr ip, [sp, #4]
+ push {lr}
+ ldr lr, [sp, #4]
+avg2_w8_loop:
+ subs ip, ip, #2
+ vld1.64 {d0}, [r2], r3
+ vld1.64 {d2}, [lr], r3
+ vrhadd.u8 d0, d0, d2
+ vld1.64 {d1}, [r2], r3
+ vld1.64 {d3}, [lr], r3
+ vrhadd.u8 d1, d1, d3
+ vst1.64 {d0}, [r0,:64], r1
+ vst1.64 {d1}, [r0,:64], r1
+ bgt avg2_w8_loop
+ pop {pc}
+.endfunc
+
+function x264_pixel_avg2_w16_neon, export=1
+ ldr ip, [sp, #4]
+ push {lr}
+ ldr lr, [sp, #4]
+avg2_w16_loop:
+ subs ip, ip, #2
+ vld1.64 {d0-d1}, [r2], r3
+ vld1.64 {d2-d3}, [lr], r3
+ vrhadd.u8 q0, q0, q1
+ vld1.64 {d4-d5}, [r2], r3
+ vld1.64 {d6-d7}, [lr], r3
+ vrhadd.u8 q2, q2, q3
+ vst1.64 {d0-d1}, [r0,:128], r1
+ vst1.64 {d4-d5}, [r0,:128], r1
+ bgt avg2_w16_loop
+ pop {pc}
+.endfunc
+
+function x264_pixel_avg2_w20_neon, export=1
+ ldr ip, [sp, #4]
+ push {lr}
+ sub r1, r1, #16
+ ldr lr, [sp, #4]
+avg2_w20_loop:
+ subs ip, ip, #2
+ vld1.64 {d0-d2}, [r2], r3
+ vld1.64 {d4-d6}, [lr], r3
+ vrhadd.u8 q0, q0, q2
+ vrhadd.u8 d2, d2, d6
+ vld1.64 {d4-d6}, [r2], r3
+ vld1.64 {d16-d18},[lr], r3
+ vrhadd.u8 q2, q2, q8
+ vst1.64 {d0-d1}, [r0,:128]!
+ vrhadd.u8 d6, d6, d18
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.64 {d4-d5}, [r0,:128]!
+ vst1.32 {d6[0]}, [r0,:32], r1
+ bgt avg2_w20_loop
+ pop {pc}
+.endfunc
+
+
+// void mc_copy( uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int height )
+function x264_mc_copy_w4_neon, export=1
+ ldr ip, [sp]
+copy_w4_loop:
+ subs ip, ip, #4
+ vld1.32 {d0[]}, [r2], r3
+ vld1.32 {d1[]}, [r2], r3
+ vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d3[]}, [r2], r3
+ vst1.32 {d0[0]}, [r0,:32], r1
+ vst1.32 {d1[0]}, [r0,:32], r1
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ bgt copy_w4_loop
+ bx lr
+.endfunc
+
+function x264_mc_copy_w8_neon, export=1
+ ldr ip, [sp]
+copy_w8_loop:
+ subs ip, ip, #4
+ vld1.32 {d0}, [r2], r3
+ vld1.32 {d1}, [r2], r3
+ vld1.32 {d2}, [r2], r3
+ vld1.32 {d3}, [r2], r3
+ vst1.32 {d0}, [r0,:64], r1
+ vst1.32 {d1}, [r0,:64], r1
+ vst1.32 {d2}, [r0,:64], r1
+ vst1.32 {d3}, [r0,:64], r1
+ bgt copy_w8_loop
+ bx lr
+.endfunc
+
+function x264_mc_copy_w16_neon, export=1
+ ldr ip, [sp]
+copy_w16_loop:
+ subs ip, ip, #4
+ vld1.32 {d0-d1}, [r2], r3
+ vld1.32 {d2-d3}, [r2], r3
+ vld1.32 {d4-d5}, [r2], r3
+ vld1.32 {d6-d7}, [r2], r3
+ vst1.32 {d0-d1}, [r0,:128], r1
+ vst1.32 {d2-d3}, [r0,:128], r1
+ vst1.32 {d4-d5}, [r0,:128], r1
+ vst1.32 {d6-d7}, [r0,:128], r1
+ bgt copy_w16_loop
+ bx lr
+.endfunc
+
+function x264_mc_copy_w16_aligned_neon, export=1
+ ldr ip, [sp]
+copy_w16_aligned_loop:
+ subs ip, ip, #4
+ vld1.32 {d0-d1}, [r2,:128], r3
+ vld1.32 {d2-d3}, [r2,:128], r3
+ vld1.32 {d4-d5}, [r2,:128], r3
+ vld1.32 {d6-d7}, [r2,:128], r3
+ vst1.32 {d0-d1}, [r0,:128], r1
+ vst1.32 {d2-d3}, [r0,:128], r1
+ vst1.32 {d4-d5}, [r0,:128], r1
+ vst1.32 {d6-d7}, [r0,:128], r1
+ bgt copy_w16_aligned_loop
+ bx lr
+.endfunc
+
+
+// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
+// uint8_t *src, int i_src_stride,
+// int dx, int dy, int i_width, int i_height );
+function x264_mc_chroma_neon, export=1
+ push {r4-r6, lr}
+ ldrd r4, [sp, #16]
+ ldr r6, [sp, #24]
+
+ asr lr, r5, #3
+ mul lr, r3, lr
+ add r2, r2, r4, asr #3
+ cmp r6, #4
+ add r2, r2, lr
+
+ and r4, r4, #7
+ and r5, r5, #7
+ pld [r2]
+ pld [r2, r3]
+
+ bgt mc_chroma_w8
+ beq mc_chroma_w4
+
+// calculate cA cB cC cD
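+// cA = (8-dx)*(8-dy), cB = dx*(8-dy), cC = (8-dx)*dy, cD = dx*dy
+// with dx = mvx&7 (r4) and dy = mvy&7 (r5); each output pixel is then
+// (cA*src[0] + cB*src[1] + cC*src[stride] + cD*src[stride+1] + 32) >> 6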
+.macro CHROMA_MC_START r0 r1
+ muls lr, r4, r5
+ rsb r6, lr, r5, lsl #3
+ rsb ip, lr, r4, lsl #3
+ sub r4, lr, r4, lsl #3
+ sub r4, r4, r5, lsl #3
+ add r4, r4, #64
+
+ beq 2f
+
+ add r5, r2, r3
+
+ vdup.8 d0, r4
+ lsl r3, r3, #1
+ vdup.8 d1, ip
+ vld1.64 {\r0}, [r2], r3
+ vdup.8 d2, r6
+ vld1.64 {\r1}, [r5], r3
+ vdup.8 d3, lr
+ ldr r4, [sp, #28]
+
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+.endm
+
+.macro CHROMA_MC width, align
+mc_chroma_w\width:
+ CHROMA_MC_START d4, d6
+// since the element size varies, there's a different index for the 2nd store
+.if \width == 4
+ .set st2, 1
+.else
+ .set st2, 2
+.endif
+
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+
+ vtrn.32 d0, d1
+ vtrn.32 d2, d3
+
+1: // height loop, interpolate xy
+ pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d2
+ vld1.64 {d4}, [r2], r3
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d2
+ vld1.64 {d6}, [r5], r3
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ vrshrn.u16 d16, q8, #6
+ subs r4, r4, #2
+ pld [r2]
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ vst1.\align {d16[0]}, [r0,:\align], r1
+ vst1.\align {d16[st2]}, [r0,:\align], r1
+ bgt 1b
+
+ pop {r4-r6, pc}
+
+2: // dx or dy is 0
+ tst r6, r6
+ add ip, ip, r6
+ vdup.8 d0, r4
+ vdup.8 d1, ip
+ vtrn.32 d0, d1
+ ldr r4, [sp, #28]
+
+ beq 4f
+
+ vext.32 d1, d0, d1, #1
+ add r5, r2, r3
+ lsl r3, r3, #1
+ vld1.32 {d4[0]}, [r2], r3
+ vld1.32 {d4[1]}, [r5], r3
+
+3: // vertical interpolation loop
+ pld [r5]
+ vmull.u8 q8, d4, d0
+ vld1.32 {d4[0]}, [r2], r3
+ vmull.u8 q9, d4, d1
+ vld1.32 {d4[1]}, [r5], r3
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ vrshrn.u16 d16, q8, #6
+ subs r4, r4, #2
+ pld [r2]
+ vst1.\align {d16[0]}, [r0,:\align], r1
+ vst1.\align {d16[st2]}, [r0,:\align], r1
+ bgt 3b
+
+ pop {r4-r6, pc}
+
+4: // dy is 0
+ vld1.64 {d4}, [r2], r3
+ vld1.64 {d6}, [r2], r3
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+
+5: // horizontal interpolation loop
+ vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+ subs r4, r4, #2
+ vld1.64 {d4}, [r2], r3
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ pld [r2]
+ vrshrn.u16 d16, q8, #6
+ vld1.64 {d6}, [r2], r3
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ pld [r2]
+ vst1.\align {d16[0]}, [r0,:\align], r1
+ vst1.\align {d16[st2]}, [r0,:\align], r1
+ bgt 5b
+
+ pop {r4-r6, pc}
+.endm
+
+ CHROMA_MC 2, 16
+ CHROMA_MC 4, 32
+
+// the optimal timing for width 8 is different enough that it's not
+// readable to put it in the same macro as widths 2/4
+mc_chroma_w8:
+ CHROMA_MC_START d4-d5, d6-d7
+
+1: // height loop, interpolate xy
+ pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ vld1.64 {d4, d5}, [r2], r3
+ vmlal.u8 q8, d6, d2
+ vext.8 d5, d4, d5, #1
+ vmlal.u8 q8, d7, d3
+ vmull.u8 q9, d6, d0
+ subs r4, r4, #2
+ vmlal.u8 q9, d7, d1
+ vmlal.u8 q9, d4, d2
+ vmlal.u8 q9, d5, d3
+ vrshrn.u16 d16, q8, #6
+ vld1.64 {d6, d7}, [r5], r3
+ pld [r2]
+ vrshrn.u16 d17, q9, #6
+ vext.8 d7, d6, d7, #1
+ vst1.64 {d16}, [r0,:64], r1
+ vst1.64 {d17}, [r0,:64], r1
+ bgt 1b
+
+ pop {r4-r6, pc}
+
+2: // dx or dy is 0
+ tst r6, r6
+ add ip, ip, r6
+ vdup.8 d0, r4
+ vdup.8 d1, ip
+ ldr r4, [sp, #28]
+
+ beq 4f
+
+ add r5, r2, r3
+ lsl r3, r3, #1
+ vld1.64 {d4}, [r2], r3
+ vld1.64 {d6}, [r5], r3
+
+3: // vertical interpolation loop
+ pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d1
+ vld1.64 {d4}, [r2], r3
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d1
+ vld1.64 {d6}, [r5], r3
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+ subs r4, r4, #2
+ pld [r2]
+ vst1.64 {d16}, [r0,:64], r1
+ vst1.64 {d17}, [r0,:64], r1
+ bgt 3b
+
+ pop {r4-r6, pc}
+
+4: // dy is 0
+ vld1.64 {d4, d5}, [r2], r3
+ vld1.64 {d6, d7}, [r2], r3
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+
+5: // horizontal interpolation loop
+ pld [r2]
+ subs r4, r4, #2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ vld1.64 {d4, d5}, [r2], r3
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d7, d1
+ pld [r2]
+ vext.8 d5, d4, d5, #1
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+ vld1.64 {d6, d7}, [r2], r3
+ vext.8 d7, d6, d7, #1
+ vst1.64 {d16}, [r0,:64], r1
+ vst1.64 {d17}, [r0,:64], r1
+ bgt 5b
+
+ pop {r4-r6, pc}
+.endfunc
+
+
+// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width)
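+// 6-tap vertical filter: v = p[-2] - 5*p[-1] + 20*p[0] + 20*p[1] - 5*p[2] + p[3]
+// with p[i] = src[i*stride]; buf gets the raw 16-bit sums, dst gets clip((v+16)>>5)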
+function x264_hpel_filter_v_neon, export=1
+ ldr ip, [sp]
+ sub r1, r1, r3, lsl #1
+ push {lr}
+ add lr, r1, ip
+ vmov.u8 d30, #5
+ vmov.u8 d31, #20
+
+filter_v_loop:
+ subs ip, ip, #16
+ vld1.64 {d0-d1}, [r1,:128], r3
+ vld1.64 {d2-d3}, [r1,:128], r3
+ vld1.64 {d4-d5}, [r1,:128], r3
+ vld1.64 {d6-d7}, [r1,:128], r3
+ vld1.64 {d16-d17}, [r1,:128], r3
+ vld1.64 {d18-d19}, [r1,:128], r3
+ sub r1, lr, ip
+
+ vaddl.u8 q10, d0, d18
+ vmlsl.u8 q10, d2, d30
+ vmlal.u8 q10, d4, d31
+ vmlal.u8 q10, d6, d31
+ vmlsl.u8 q10, d16, d30
+
+ vaddl.u8 q11, d1, d19
+ vmlsl.u8 q11, d3, d30
+ vmlal.u8 q11, d5, d31
+ vmlal.u8 q11, d7, d31
+ vmlsl.u8 q11, d17, d30
+
+ vqrshrun.s16 d0, q10, #5
+ vst1.64 {d20-d21}, [r2,:128]!
+ vqrshrun.s16 d1, q11, #5
+ vst1.64 {d22-d23}, [r2,:128]!
+ vst1.64 {d0-d1}, [r0,:128]!
+ bgt filter_v_loop
+ pop {pc}
+.endfunc
+
+// hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
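+// second pass over the 16-bit sums from the vertical filter; with
+// a = t[-2]+t[3], b = t[-1]+t[2], c = t[0]+t[1] it uses
+// a - 5*b + 20*c == (((a-b)/4 - b + c)/4 + c) * 16,
+// keeping the intermediates divided by 16 (presumably to stay within 16 bits)
+// before the final rounding shift by 6 (>>10 in total) and saturation to 8 bits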
+function x264_hpel_filter_c_neon, export=1
+ sub r1, #16
+ vld1.64 {d0-d3}, [r1,:128]!
+
+ // unrolled 2x: 4% faster
+filter_c_loop:
+ subs r2, r2, #16
+ vld1.64 {d4-d7}, [r1,:128]!
+ vext.16 q8, q0, q1, #6
+ vext.16 q12, q1, q2, #3
+ vadd.s16 q8, q8, q12
+ vext.16 q9, q0, q1, #7
+ vext.16 q11, q1, q2, #2
+ vadd.s16 q9, q9, q11
+ vext.16 q10, q1, q2, #1
+ vext.16 q11, q1, q2, #6
+ vadd.s16 q10, q1, q10
+ vsub.s16 q8, q8, q9 // a-b
+ vext.16 q15, q2, q3, #3
+ vsub.s16 q9, q9, q10 // b-c
+
+ vext.16 q12, q1, q2, #7
+ vshr.s16 q8, q8, #2 // (a-b)/4
+ vadd.s16 q11, q11, q15
+ vext.16 q14, q2, q3, #2
+ vsub.s16 q8, q8, q9 // (a-b)/4-b+c
+ vadd.s16 q12, q12, q14
+ vext.16 q13, q2, q3, #1
+
+ vshr.s16 q8, q8, #2 // ((a-b)/4-b+c)/4
+ vadd.s16 q13, q2, q13
+ vadd.s16 q8, q8, q10 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vsub.s16 q11, q11, q12 // a-b
+ vsub.s16 q12, q12, q13 // b-c
+ vshr.s16 q11, q11, #2 // (a-b)/4
+ vqrshrun.s16 d30, q8, #6
+ vsub.s16 q11, q11, q12 // (a-b)/4-b+c
+ vshr.s16 q11, q11, #2 // ((a-b)/4-b+c)/4
+ vld1.64 {d0-d3}, [r1,:128]!
+ vadd.s16 q11, q11, q13 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+
+ vext.16 q8, q2, q3, #6
+ vqrshrun.s16 d31, q11, #6
+ vext.16 q12, q3, q0, #3
+ vadd.s16 q8, q8, q12
+ vext.16 q9, q2, q3, #7
+ vst1.64 {d30-d31}, [r0,:128]!
+ bxle lr
+ subs r2, r2, #16
+
+ vext.16 q11, q3, q0, #2
+ vadd.s16 q9, q9, q11
+ vext.16 q10, q3, q0, #1
+ vext.16 q11, q3, q0, #6
+ vadd.s16 q10, q3, q10
+ vsub.s16 q8, q8, q9 // a-b
+ vext.16 q15, q0, q1, #3
+ vsub.s16 q9, q9, q10 // b-c
+
+ vext.16 q12, q3, q0, #7
+ vshr.s16 q8, q8, #2 // (a-b)/4
+ vadd.s16 q11, q11, q15
+ vext.16 q14, q0, q1, #2
+ vsub.s16 q8, q8, q9 // (a-b)/4-b+c
+ vadd.s16 q12, q12, q14
+ vext.16 q13, q0, q1, #1
+
+ vshr.s16 q8, q8, #2 // ((a-b)/4-b+c)/4
+ vadd.s16 q13, q0, q13
+ vadd.s16 q8, q8, q10 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ vsub.s16 q11, q11, q12 // a-b
+ vsub.s16 q12, q12, q13 // b-c
+ vshr.s16 q11, q11, #2 // (a-b)/4
+ vqrshrun.s16 d30, q8, #6
+ vsub.s16 q11, q11, q12 // (a-b)/4-b+c
+ vshr.s16 q11, q11, #2 // ((a-b)/4-b+c)/4
+ vadd.s16 q11, q11, q13 // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+
+ vqrshrun.s16 d31, q11, #6
+ vst1.64 {d30-d31}, [r0,:128]!
+ bgt filter_c_loop
+ bx lr
+.endfunc
+
+// hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
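+// 6-tap horizontal filter: h = src[-2] - 5*src[-1] + 20*src[0] + 20*src[1] - 5*src[2] + src[3],
+// dst = clip((h+16)>>5)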
+function x264_hpel_filter_h_neon, export=1
+ sub r1, #16
+ vmov.u8 d30, #5
+ vld1.64 {d0-d3}, [r1,:128]!
+ vmov.u8 d31, #20
+
+ // unrolled 3x because it's 5% faster, due to mitigating
+ // the high latency of multiplication and vqrshrun
+filter_h_loop:
+ subs r2, r2, #16
+ vld1.64 {d4-d5}, [r1,:128]!
+ vext.8 q8, q0, q1, #14
+ vext.8 q12, q1, q2, #3
+ vaddl.u8 q13, d16, d24
+ vext.8 q9, q0, q1, #15
+ vaddl.u8 q14, d17, d25
+
+ vext.8 q10, q1, q2, #1
+ vmlal.u8 q13, d2, d31
+ vmlsl.u8 q13, d18, d30
+ vext.8 q11, q1, q2, #2
+ vmlal.u8 q13, d20, d31
+ vmlsl.u8 q13, d22, d30
+
+ vmlsl.u8 q14, d19, d30
+ vmlal.u8 q14, d3, d31
+ vmlal.u8 q14, d21, d31
+ vmlsl.u8 q14, d23, d30
+ vqrshrun.s16 d6, q13, #5
+
+ vld1.64 {d0-d1}, [r1,:128]!
+ vext.8 q8, q1, q2, #14
+ vext.8 q12, q2, q0, #3
+ vaddl.u8 q13, d16, d24
+ vqrshrun.s16 d7, q14, #5
+ vext.8 q9, q1, q2, #15
+ vaddl.u8 q14, d17, d25
+
+ vst1.64 {d6-d7}, [r0,:128]!
+ bxle lr
+ subs r2, r2, #16
+
+ vext.8 q10, q2, q0, #1
+ vmlal.u8 q13, d4, d31
+ vmlsl.u8 q13, d18, d30
+ vext.8 q11, q2, q0, #2
+ vmlal.u8 q13, d20, d31
+ vmlsl.u8 q13, d22, d30
+
+ vmlsl.u8 q14, d19, d30
+ vmlal.u8 q14, d5, d31
+ vmlal.u8 q14, d21, d31
+ vmlsl.u8 q14, d23, d30
+ vqrshrun.s16 d6, q13, #5
+
+ vld1.64 {d2-d3}, [r1,:128]!
+ vext.8 q8, q2, q0, #14
+ vext.8 q12, q0, q1, #3
+ vaddl.u8 q13, d16, d24
+ vqrshrun.s16 d7, q14, #5
+ vext.8 q9, q2, q0, #15
+ vaddl.u8 q14, d17, d25
+
+ vst1.64 {d6-d7}, [r0,:128]!
+ bxle lr
+ subs r2, r2, #16
+
+ vext.8 q10, q0, q1, #1
+ vmlal.u8 q13, d0, d31
+ vmlsl.u8 q13, d18, d30
+ vext.8 q11, q0, q1, #2
+ vmlal.u8 q13, d20, d31
+ vmlsl.u8 q13, d22, d30
+
+ vmlsl.u8 q14, d19, d30
+ vmlal.u8 q14, d1, d31
+ vmlal.u8 q14, d21, d31
+ vmlsl.u8 q14, d23, d30
+
+ vqrshrun.s16 d6, q13, #5
+ vqrshrun.s16 d7, q14, #5
+ vst1.64 {d6-d7}, [r0,:128]!
+ bgt filter_h_loop
+ bx lr
+.endfunc
+
+
+// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
+// uint8_t *dstc, int src_stride, int dst_stride, int width,
+// int height )
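+// halves the resolution of src0 into four planes (dst0/dsth/dstv/dstc) offset by
+// 0 or 1 source pixel to the right/down; each sample is a rounded
+// average-of-averages over the corresponding 2x2 source window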
+function x264_frame_init_lowres_core_neon, export=1
+ push {r4-r10,lr}
+ vpush {d8-d15}
+ ldrd r4, [sp, #96]
+ ldrd r6, [sp, #104]
+ ldr lr, [sp, #112]
+ sub r10, r6, r7 // dst_stride - width
+ and r10, r10, #~15
+
+lowres_yloop:
+ mov ip, r7 // width
+ mov r6, r0 // src0
+ add r8, r0, r5 // src1 = src0 + src_stride
+ add r9, r0, r5, lsl #1 // src2 = src1 + src_stride
+
+ vld2.8 {d8, d10}, [r6,:128]!
+ vld2.8 {d12,d14}, [r8,:128]!
+ vld2.8 {d16,d18}, [r9,:128]!
+
+lowres_xloop:
+ subs ip, ip, #16
+
+ vld2.8 {d9, d11}, [r6,:128]!
+ vld2.8 {d13,d15}, [r8,:128]!
+ vrhadd.u8 q0, q4, q6
+ vld2.8 {d17,d19}, [r9,:128]!
+ vrhadd.u8 q5, q5, q7
+ vld2.8 {d20,d22}, [r6,:128]!
+ vrhadd.u8 q1, q6, q8
+ vld2.8 {d24,d26}, [r8,:128]!
+ vrhadd.u8 q7, q7, q9
+ vext.8 q4, q4, q10, #1
+ vrhadd.u8 q0, q0, q5
+ vext.8 q6, q6, q12, #1
+ vrhadd.u8 q1, q1, q7
+ vld2.8 {d28,d30}, [r9,:128]!
+ vrhadd.u8 q4, q4, q6
+ vext.8 q8, q8, q14, #1
+ vrhadd.u8 q6, q6, q8
+ vst1.64 {d0-d1}, [r1,:128]!
+ vrhadd.u8 q2, q4, q5
+ vst1.64 {d2-d3}, [r3,:128]!
+ vrhadd.u8 q3, q6, q7
+ vst1.64 {d4-d5}, [r2,:128]!
+ vst1.64 {d6-d7}, [r4,:128]!
+
+ ble lowres_xloop_end
+ subs ip, ip, #16
+
+ vld2.8 {d21,d23}, [r6,:128]!
+ vld2.8 {d25,d27}, [r8,:128]!
+ vrhadd.u8 q0, q10, q12
+ vld2.8 {d29,d31}, [r9,:128]!
+ vrhadd.u8 q11, q11, q13
+ vld2.8 {d8, d10}, [r6,:128]!
+ vrhadd.u8 q1, q12, q14
+ vld2.8 {d12,d14}, [r8,:128]!
+ vrhadd.u8 q13, q13, q15
+ vext.8 q10, q10, q4, #1
+ vrhadd.u8 q0, q0, q11
+ vext.8 q12, q12, q6, #1
+ vrhadd.u8 q1, q1, q13
+ vld2.8 {d16,d18}, [r9,:128]!
+ vrhadd.u8 q10, q10, q12
+ vext.8 q14, q14, q8, #1
+ vrhadd.u8 q12, q12, q14
+ vst1.64 {d0-d1}, [r1,:128]!
+ vrhadd.u8 q2, q10, q11
+ vst1.64 {d2-d3}, [r3,:128]!
+ vrhadd.u8 q3, q12, q13
+ vst1.64 {d4-d5}, [r2,:128]!
+ vst1.64 {d6-d7}, [r4,:128]!
+
+ bgt lowres_xloop
+
+lowres_xloop_end:
+ subs lr, lr, #1
+ add r0, r0, r5, lsl #1
+ add r1, r1, r10
+ add r2, r2, r10
+ add r3, r3, r10
+ add r4, r4, r10
+ bgt lowres_yloop
+
+ vpop {d8-d15}
+ pop {r4-r10,pc}
+.endfunc
--- /dev/null
+/*****************************************************************************
+ * mc-c.c: h264 encoder library (Motion Compensation)
+ *****************************************************************************
+ * Copyright (C) 2009 x264 project
+ *
+ * Authors: David Conrad <lessen42@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "mc.h"
+
+void x264_prefetch_ref_arm( uint8_t *, int, int );
+void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int );
+
+void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n );
+void x264_memzero_aligned_neon( void *dst, size_t n );
+
+void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_8x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_8x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_8x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_4x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_4x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+void x264_pixel_avg_4x2_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
+
+void x264_pixel_avg2_w4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
+
+void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
+
+void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
+void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
+
+static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
+{
+ NULL,
+ x264_pixel_avg2_w4_neon,
+ x264_pixel_avg2_w8_neon,
+ x264_pixel_avg2_w16_neon, // width 12: w16 is no slower, so no point in a separate function
+ x264_pixel_avg2_w16_neon,
+ x264_pixel_avg2_w20_neon,
+};
+
+static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int, int ) =
+{
+ NULL,
+ x264_mc_copy_w4_neon,
+ x264_mc_copy_w8_neon,
+ NULL,
+ x264_mc_copy_w16_neon,
+};
+
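+/* map the qpel index ((mvy&3)<<2 | (mvx&3)) to the half-pel planes in src[]
+ * (0 = fullpel, 1 = horizontal, 2 = vertical, 3 = centre) that get averaged */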
+static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+
+static void mc_luma_neon( uint8_t *dst, int i_dst_stride,
+ uint8_t *src[4], int i_src_stride,
+ int mvx, int mvy,
+ int i_width, int i_height )
+{
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+ if ( (mvy&3) == 3 ) // explicit if() to force conditional add
+ src1 += i_src_stride;
+
+ if( qpel_idx & 5 ) /* qpel interpolation needed */
+ {
+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+ x264_pixel_avg_wtab_neon[i_width>>2](
+ dst, i_dst_stride, src1, i_src_stride,
+ src2, i_height );
+ }
+ else
+ {
+ x264_mc_copy_wtab_neon[i_width>>2](
+ dst, i_dst_stride, src1, i_src_stride, i_height );
+ }
+}
+
+static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride,
+ uint8_t *src[4], int i_src_stride,
+ int mvx, int mvy,
+ int i_width, int i_height )
+{
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+ int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+ if ( (mvy&3) == 3 ) // explicit if() to force conditional add
+ src1 += i_src_stride;
+
+ if( qpel_idx & 5 ) /* qpel interpolation needed */
+ {
+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+ x264_pixel_avg_wtab_neon[i_width>>2](
+ dst, *i_dst_stride, src1, i_src_stride,
+ src2, i_height );
+ return dst;
+ }
+ else
+ {
+ *i_dst_stride = i_src_stride;
+ return src1;
+ }
+}
+
+void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
+void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
+void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
+
+static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+ int stride, int width, int height, int16_t *buf )
+{
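+ /* round src (and the dst pointers with it) down to 16-byte alignment so the
+ * NEON kernels can use aligned accesses; width grows to cover the same pixels */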
+ int realign = (intptr_t)src & 15;
+ src -= realign;
+ dstv -= realign;
+ dstc -= realign;
+ dsth -= realign;
+ width += realign;
+ while( height-- )
+ {
+ x264_hpel_filter_v_neon( dstv, src, buf+8, stride, width );
+ x264_hpel_filter_c_neon( dstc, buf+8, width );
+ x264_hpel_filter_h_neon( dsth, src, width );
+ dsth += stride;
+ dstv += stride;
+ dstc += stride;
+ src += stride;
+ }
+}
+
+void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
+{
+ if( !(cpu&X264_CPU_ARMV6) )
+ return;
+
+ pf->prefetch_fenc = x264_prefetch_fenc_arm;
+ pf->prefetch_ref = x264_prefetch_ref_arm;
+
+ if( !(cpu&X264_CPU_NEON) )
+ return;
+
+ pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
+ pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon;
+ pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
+ pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
+
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon;
+ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon;
+ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon;
+ pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon;
+
+ pf->memcpy_aligned = x264_memcpy_aligned_neon;
+ pf->memzero_aligned = x264_memzero_aligned_neon;
+
+ pf->mc_chroma = x264_mc_chroma_neon;
+ pf->mc_luma = mc_luma_neon;
+ pf->get_ref = get_ref_neon;
+ pf->hpel_filter = hpel_filter_neon;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+}