+endfunc
+
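+@ Assumed C prototype (per x264's mc.h):
+@ void x264_plane_copy_deinterleave_rgb( uint8_t *dsta, intptr_t i_dsta,
+@                                        uint8_t *dstb, intptr_t i_dstb,
+@                                        uint8_t *dstc, intptr_t i_dstc,
+@                                        uint8_t *src,  intptr_t i_src,
+@                                        int pw, int w, int h )
+@ Splits packed 3-byte (pw == 3) or 4-byte (pw == 4) pixels into three
+@ planes, 8 pixels per iteration. Register notes below are inferred from
+@ the AAPCS stack layout.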
+function x264_plane_copy_deinterleave_rgb_neon
+ push {r4-r8, r10, r11, lr}
+ ldrd r4, r5, [sp, #32]    @ r4 = dstc, r5 = i_dstc
+ ldrd r6, r7, [sp, #40]    @ r6 = src,  r7 = i_src
+ ldr r8, [sp, #48]         @ r8 = pw (bytes per input pixel: 3 or 4)
+ ldrd r10, r11, [sp, #52]  @ r10 = w, r11 = h
+ add lr, r10, #7
+ subs r8, r8, #3           @ EQ: pw == 3, NE: pw == 4
+ bic lr, lr, #7            @ lr = w rounded up to a multiple of 8
+ sub r7, r7, lr, lsl #1    @ i_src -= 2*lr (completed to 3*lr/4*lr below)
+ sub r1, r1, lr            @ turn the dst strides into end-of-row skips
+ sub r3, r3, lr
+ sub r5, r5, lr
+ subne r7, r7, lr, lsl #1  @ pw == 4: src advances 4*lr bytes per row
+ subeq r7, r7, lr          @ pw == 3: src advances 3*lr bytes per row
+ bne block4
+block3:
+ vld3.8 {d0,d1,d2}, [r6]!  @ load and deinterleave 8 3-byte pixels
+ subs lr, lr, #8
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d1}, [r2]!
+ vst1.8 {d2}, [r4]!
+ bgt block3
+
+ subs r11, r11, #1         @ --h
+ add r0, r0, r1            @ skip to the start of the next row
+ add r2, r2, r3
+ add r4, r4, r5
+ add r6, r6, r7
+ mov lr, r10               @ reset the width counter
+ bgt block3
+
+ pop {r4-r8, r10, r11, pc}
+block4:
+ vld4.8 {d0,d1,d2,d3}, [r6]!  @ load 8 4-byte pixels; d3 (4th channel) is dropped
+ subs lr, lr, #8
+ vst1.8 {d0}, [r0]!
+ vst1.8 {d1}, [r2]!
+ vst1.8 {d2}, [r4]!
+ bgt block4
+
+ subs r11, r11, #1
+ add r0, r0, r1
+ add r2, r2, r3
+ add r4, r4, r5
+ add r6, r6, r7
+ mov lr, r10
+ bgt block4
+
+ pop {r4-r8, r10, r11, pc}
+endfunc
+
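+@ void x264_plane_copy_interleave( uint8_t *dst,  intptr_t i_dst,
+@                                  uint8_t *srcu, intptr_t i_srcu,
+@                                  uint8_t *srcv, intptr_t i_srcv,
+@                                  int w, int h )
+@ Interleaves separate U and V planes into one packed UV plane, 16 pixels
+@ of each source per iteration.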
+function x264_plane_copy_interleave_neon
+ push {r4-r7, lr}
+ ldrd r6, r7, [sp, #28]    @ r6 = w, r7 = h
+ ldrd r4, r5, [sp, #20]    @ r4 = srcv, r5 = i_srcv
+ add lr, r6, #15
+ bic lr, lr, #15           @ lr = w rounded up to a multiple of 16
+ sub r1, r1, lr, lsl #1    @ dst writes 2 bytes per input pixel
+ sub r3, r3, lr
+ sub r5, r5, lr
+blocki:
+ vld1.8 {q0}, [r2]!        @ 16 bytes of U
+ vld1.8 {q1}, [r4]!        @ 16 bytes of V
+ subs lr, lr, #16
+ vst2.8 {d0,d2}, [r0]!     @ store interleaved UVUV...
+ vst2.8 {d1,d3}, [r0]!
+ bgt blocki
+
+ subs r7, r7, #1
+ add r0, r0, r1
+ add r2, r2, r3
+ add r4, r4, r5
+ mov lr, r6
+ bgt blocki
+
+ pop {r4-r7, pc}
+endfunc
+
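+@ void x264_plane_copy_swap( uint8_t *dst, intptr_t i_dst,
+@                            uint8_t *src, intptr_t i_src, int w, int h )
+@ Copies a packed UV plane while swapping the two bytes of every pixel
+@ pair; w counts pairs, so each row is 2*w bytes.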
+function x264_plane_copy_swap_neon
+ push {r4-r5, lr}
+ ldrd r4, r5, [sp, #12]    @ r4 = w (pixel pairs), r5 = h
+ add lr, r4, #15
+ bic lr, lr, #15           @ lr = w rounded up to a multiple of 16
+ sub r1, r1, lr, lsl #1    @ 2 bytes per pixel pair
+ sub r3, r3, lr, lsl #1
+1:
+ vld1.8 {q0, q1}, [r2]!
+ subs lr, lr, #16
+ vrev16.8 q0, q0           @ swap the two bytes of each 16-bit lane
+ vrev16.8 q1, q1
+ vst1.8 {q0, q1}, [r0]!
+ bgt 1b
+
+ subs r5, r5, #1
+ add r0, r0, r1
+ add r2, r2, r3
+ mov lr, r4
+ bgt 1b
+
+ pop {r4-r5, pc}
+endfunc
+
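+@ void x264_store_interleave_chroma( uint8_t *dst, intptr_t i_dst,
+@                                    uint8_t *srcu, uint8_t *srcv, int height )
+@ Interleaves one 8-byte U row and one 8-byte V row per iteration, reading
+@ from fixed-stride (FDEC_STRIDE) source buffers.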
+function x264_store_interleave_chroma_neon
+ push {lr}
+ ldr lr, [sp, #4]          @ lr = height
+ mov ip, #FDEC_STRIDE      @ fixed stride of the U/V source buffers
+1:
+ vld1.8 {d0}, [r2], ip     @ 8 bytes of U
+ vld1.8 {d1}, [r3], ip     @ 8 bytes of V
+ subs lr, lr, #1
+ vst2.8 {d0,d1}, [r0,:128], r1  @ store 16 interleaved bytes
+ bgt 1b
+
+ pop {pc}
+endfunc
+
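+@ Horizontal pass of the 4x4 integral image: for eight u16 results,
+@ q0[x] = p[x] + p[x+1] + p[x+2] + p[x+3] + q2[x], where q2 carries the
+@ already-computed sums of the row above.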
+.macro integral4h p1, p2
+ vext.8 d1, \p1, \p2, #1   @ p[x+1]
+ vext.8 d2, \p1, \p2, #2   @ p[x+2]
+ vext.8 d3, \p1, \p2, #3   @ p[x+3]
+ vaddl.u8 q0, \p1, d1      @ widen to u16: p[x] + p[x+1]
+ vaddl.u8 q1, d2, d3       @ p[x+2] + p[x+3]
+ vadd.u16 q0, q0, q1
+ vadd.u16 q0, q0, q2       @ add the row above
+.endm
+
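+@ void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
+@ sum[x] = pix[x] + pix[x+1] + pix[x+2] + pix[x+3] + sum[x - stride],
+@ 16 pixels per iteration as two integral4h halves.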
+function integral_init4h_neon
+ sub r3, r0, r2, lsl #1    @ r3 = &sum[-stride] (u16 elements)
+ vld1.8 {d6, d7}, [r1, :128]!  @ prime d6/d7 with the first 16 pixels
+1:
+ subs r2, r2, #16
+ vld1.16 {q2}, [r3, :128]!     @ row-above sums for the first half
+ integral4h d6, d7
+ vld1.8 {d6}, [r1, :64]!       @ refill the half just consumed
+ vld1.16 {q2}, [r3, :128]!     @ row-above sums for the second half
+ vst1.16 {q0}, [r0, :128]!
+ integral4h d7, d6
+ vld1.8 {d7}, [r1, :64]!
+ vst1.16 {q0}, [r0, :128]!
+ bgt 1b
+ bx lr
+endfunc
+
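+@ 8-tap variant of integral4h for the 8x8 integral image; \s carries the
+@ row-above sums.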
+.macro integral8h p1, p2, s
+ vext.8 d1, \p1, \p2, #1
+ vext.8 d2, \p1, \p2, #2
+ vext.8 d3, \p1, \p2, #3
+ vext.8 d4, \p1, \p2, #4
+ vext.8 d5, \p1, \p2, #5
+ vext.8 d6, \p1, \p2, #6
+ vext.8 d7, \p1, \p2, #7
+ vaddl.u8 q0, \p1, d1
+ vaddl.u8 q1, d2, d3
+ vaddl.u8 q2, d4, d5
+ vaddl.u8 q3, d6, d7
+ vadd.u16 q0, q0, q1
+ vadd.u16 q2, q2, q3
+ vadd.u16 q0, q0, q2
+ vadd.u16 q0, q0, \s
+.endm
+
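+@ void integral_init8h( uint16_t *sum, uint8_t *pix, intptr_t stride )
+@ sum[x] = pix[x] + ... + pix[x+7] + sum[x - stride]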
+function integral_init8h_neon
+ sub r3, r0, r2, lsl #1    @ r3 = &sum[-stride] (u16 elements)
+ vld1.8 {d16, d17}, [r1, :128]!  @ prime with the first 16 pixels
+1:
+ subs r2, r2, #16
+ vld1.16 {q9}, [r3, :128]!
+ integral8h d16, d17, q9
+ vld1.8 {d16}, [r1, :64]!
+ vld1.16 {q9}, [r3, :128]!
+ vst1.16 {q0}, [r0, :128]!
+ integral8h d17, d16, q9
+ vld1.8 {d17}, [r1, :64]!
+ vst1.16 {q0}, [r0, :128]!
+ bgt 1b
+ bx lr
+endfunc
+
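+@ void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
+@ Vertical pass: sum4[x] = sum8[x + 4*stride] - sum8[x] and, in place,
+@ sum8[x] = sum8[x + 8*stride] + sum8[x + 8*stride + 4]
+@         - sum8[x] - sum8[x + 4]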
+function integral_init4v_neon
+ push {r4-r5}
+ mov r3, r0                @ r3 = row y of sum8
+ add r4, r0, r2, lsl #3    @ r4 = row y+4 (stride is in u16 elements)
+ add r5, r0, r2, lsl #4    @ r5 = row y+8
+ sub r2, r2, #8            @ loop over stride-8 elements
+ vld1.16 {q11, q12}, [r3]!    @ prime: row y in q11..q13, row y+8 in q8..q10
+ vld1.16 {q8, q9}, [r5]!
+ vld1.16 {q13}, [r3]!
+ vld1.16 {q10}, [r5]!
+1:
+ subs r2, r2, #16
+ vld1.16 {q14, q15}, [r4]!     @ row y+4
+ vext.8 q0, q11, q12, #8       @ row y   shifted by 4 elements: sum8[x+4]
+ vext.8 q1, q12, q13, #8
+ vext.8 q2, q8, q9, #8         @ row y+8 shifted by 4 elements
+ vext.8 q3, q9, q10, #8
+ vsub.u16 q14, q14, q11        @ sum4[x] = sum8[x+4*stride] - sum8[x]
+ vsub.u16 q15, q15, q12
+ vadd.u16 q0, q0, q11          @ sum8[x] + sum8[x+4]   (row y)
+ vadd.u16 q1, q1, q12
+ vadd.u16 q2, q2, q8           @ sum8[x] + sum8[x+4]   (row y+8)
+ vadd.u16 q3, q3, q9
+ vst1.16 {q14}, [r1]!          @ store sum4
+ vst1.16 {q15}, [r1]!
+ vmov q11, q13                 @ slide both row windows along
+ vmov q8, q10
+ vsub.u16 q0, q2, q0           @ (row y+8 pair sum) - (row y pair sum)
+ vsub.u16 q1, q3, q1
+ vld1.16 {q12, q13}, [r3]!
+ vld1.16 {q9, q10}, [r5]!
+ vst1.16 {q0}, [r0]!           @ store sum8 in place
+ vst1.16 {q1}, [r0]!
+ bgt 1b
+2:
+ pop {r4-r5}
+ bx lr
+endfunc
+
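+@ void integral_init8v( uint16_t *sum8, intptr_t stride )
+@ Vertical pass, in place: sum8[x] = sum8[x + 8*stride] - sum8[x]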
+function integral_init8v_neon
+ add r2, r0, r1, lsl #4    @ r2 = &sum8[8*stride] (stride in u16 elements)
+ sub r1, r1, #8            @ element count: stride - 8
+ ands r3, r1, #16 - 1
+ beq 1f                    @ count already a multiple of 16
+ subs r1, r1, #8           @ peel one 8-element chunk to align the count
+ vld1.16 {q0}, [r0]
+ vld1.16 {q2}, [r2]!
+ vsub.u16 q8, q2, q0       @ sum8[x+8*stride] - sum8[x]
+ vst1.16 {q8}, [r0]!
+ ble 2f
+1:
+ subs r1, r1, #16
+ vld1.16 {q0, q1}, [r0]
+ vld1.16 {q2, q3}, [r2]!
+ vsub.u16 q8, q2, q0
+ vsub.u16 q9, q3, q1
+ vst1.16 {q8}, [r0]!
+ vst1.16 {q9}, [r0]!
+ bgt 1b
+2:
+ bx lr
+endfunc