Fix comment for mc_copy_neon.
Fix memzero_aligned_neon prototype.
Update NEON (i)dct_dc prototypes.
Duplicate x86 behavior for global+hidden functions.
%.o: %.S
$(AS) $(ASFLAGS) -o $@ $<
+ -@ $(STRIP) -x $@ # delete local/anonymous symbols, so they don't show up in oprofile
.depend: config.mak
rm -f .depend
#include "config.h"
+#ifdef __ELF__
+# define ELF
+#else
+# define ELF @
+#endif
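
On ARM, gas uses '@' as the line-comment character, so the non-ELF definition above turns any ELF-prefixed directive into a comment once cpp has run. A minimal sketch of both expansions, with a hypothetical symbol:

    ELF .hidden x264_foo    @ as written in the source
    .hidden x264_foo        @ after cpp when __ELF__ is defined (ELF expands to nothing)
    @ .hidden x264_foo      @ after cpp otherwise (the whole line becomes a comment)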
+
.macro require8, val=1
- .eabi_attribute 24, \val
+ELF .eabi_attribute 24, \val
.endm
.macro preserve8, val=1
- .eabi_attribute 25, \val
+ELF .eabi_attribute 25, \val
.endm
- .macro function name, export=0
-.if \export
+ .macro function name
.global \name
-.endif
- .type \name, %function
+ELF .hidden \name
+ELF .type \name, %function
.func \name
\name:
.endm
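
This duplicates the x86 behavior: every asm symbol is global, so profilers and debuggers can resolve it, but on ELF it is also marked hidden so it is not exported from a shared libx264. After the change, a declaration expands roughly as follows (hypothetical symbol name):

    function x264_foo
    @ is equivalent on ELF to:
    .global x264_foo
    .hidden x264_foo
    .type   x264_foo, %function
    .func   x264_foo
    x264_foo: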
// done in gas because .fpu neon overrides the refusal to assemble
// instructions the selected -march/-mcpu doesn't support
-function x264_cpu_neon_test, export=1
+function x264_cpu_neon_test
vadd.i16 q0, q0, q0
bx lr
.endfunc
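
The point of the comment above: gas's .fpu directive takes precedence over the -mfpu/-mcpu given on the command line, so a NEON instruction can be assembled into a binary whose baseline target lacks NEON, and is only executed after the runtime check passes. A sketch of the idea, assuming a build targeting a pre-NEON core:

    .fpu neon               @ overrides e.g. -mfpu=vfp from the command line
    vadd.i16 q0, q0, q0     @ assembles fine; would be rejected without the directive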
// return: 0 if neon -> arm transfers take more than 10 cycles
// nonzero otherwise
-function x264_cpu_fast_neon_mrc_test, export=1
+function x264_cpu_fast_neon_mrc_test
// check for user access to performance counters
mrc p15, 0, r0, c9, c14, 0
cmp r0, #0
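
The rest of the function (elided here) times a NEON-to-ARM register transfer with the cycle counter; the mrc above only checks that user space may read the counters at all. For reference, the two coprocessor reads involved, per the ARMv7 manual:

    mrc p15, 0, r0, c9, c14, 0   @ PMUSERENR: nonzero if user access is enabled
    mrc p15, 0, r0, c9, c13, 0   @ PMCCNTR: the cycle counter itself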
.endm
-function x264_dct4x4dc_neon, export=1
+function x264_dct4x4dc_neon
vld1.64 {d0-d3}, [r0,:128]
SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
bx lr
.endfunc
-function x264_idct4x4dc_neon, export=1
+function x264_idct4x4dc_neon
vld1.64 {d0-d3}, [r0,:128]
SUMSUB_ABCD d4, d5, d6, d7, d0, d1, d2, d3
SUMSUB_ABCD d0, d2, d3, d1, d4, d6, d5, d7
vsub.s16 \d3, \d7, \d5
.endm
-function x264_sub4x4_dct_neon, export=1
+function x264_sub4x4_dct_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
vld1.32 {d0[]}, [r1,:32], r3
bx lr
.endfunc
-function x264_sub8x4_dct_neon, export=1
+function x264_sub8x4_dct_neon
vld1.64 {d0}, [r1,:64], r3
vld1.64 {d1}, [r2,:64], ip
vsubl.u8 q8, d0, d1
bx lr
.endfunc
-function x264_sub8x8_dct_neon, export=1
+function x264_sub8x8_dct_neon
push {lr}
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
b x264_sub8x4_dct_neon
.endfunc
-function x264_sub16x16_dct_neon, export=1
+function x264_sub16x16_dct_neon
push {lr}
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
SUMSUB_SHR2 2, q11, q13, q3, q13, q0, q1
.endm
-function x264_sub8x8_dct8_neon, export=1
+function x264_sub8x8_dct8_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d16}, [r1,:64], r3
bx lr
.endfunc
-function x264_sub16x16_dct8_neon, export=1
+function x264_sub16x16_dct8_neon
push {lr}
bl x264_sub8x8_dct8_neon
sub r1, r1, #FENC_STRIDE*8 - 8
vadd.s16 \d6, \d6, \d1
.endm
-function x264_add4x4_idct_neon, export=1
+function x264_add4x4_idct_neon
mov r2, #FDEC_STRIDE
vld1.64 {d0-d3}, [r1,:128]
bx lr
.endfunc
-function x264_add8x4_idct_neon, export=1
+function x264_add8x4_idct_neon
vld1.64 {d0-d3}, [r1,:128]!
IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3
vld1.64 {d4-d7}, [r1,:128]!
bx lr
.endfunc
-function x264_add8x8_idct_neon, export=1
+function x264_add8x8_idct_neon
mov r2, #FDEC_STRIDE
mov ip, lr
bl x264_add8x4_idct_neon
b x264_add8x4_idct_neon
.endfunc
-function x264_add16x16_idct_neon, export=1
+function x264_add16x16_idct_neon
mov r2, #FDEC_STRIDE
mov ip, lr
bl x264_add8x4_idct_neon
SUMSUB_AB q11, q12, q2, q12
.endm
-function x264_add8x8_idct8_neon, export=1
+function x264_add8x8_idct8_neon
mov r2, #FDEC_STRIDE
vld1.64 {d16-d19}, [r1,:128]!
vld1.64 {d20-d23}, [r1,:128]!
bx lr
.endfunc
-function x264_add16x16_idct8_neon, export=1
+function x264_add16x16_idct8_neon
mov ip, lr
bl x264_add8x8_idct8_neon
sub r0, r0, #8*FDEC_STRIDE-8
.endfunc
-function x264_add8x8_idct_dc_neon, export=1
+function x264_add8x8_idct_dc_neon
mov r2, #FDEC_STRIDE
vld1.64 {d16}, [r1,:64]
vrshr.s16 d16, d16, #6
vst1.64 {d22-d23}, [r2,:128], r3
.endm
-function x264_add16x16_idct_dc_neon, export=1
+function x264_add16x16_idct_dc_neon
mov r2, r0
mov r3, #FDEC_STRIDE
vmov.i16 q15, #0
bx lr
.endfunc
-function x264_sub8x8_dct_dc_neon, export=1
+function x264_sub8x8_dct_dc_neon
mov r3, #FENC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d16}, [r1,:64], r3
.endfunc
-function x264_zigzag_scan_4x4_frame_neon, export=1
+function x264_zigzag_scan_4x4_frame_neon
movrel r2, scan4x4_frame
vld1.64 {d0-d3}, [r1,:128]
vld1.64 {d16-d19}, [r2,:128]
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
-void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[2][2] );
+void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
-void x264_sub8x8_dct_dc_neon( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
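
int16_t dct[2][2] and int16_t dct[4] name the same eight bytes, but the flat form matches the generic prototypes and is what the NEON code actually does with the argument, moving all four DC coefficients as a single 64-bit load/store:

    vld1.64 {d16}, [r1,:64]   @ the whole int16_t dct[4] in one d register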
vqmovun.s16 d1, q12
.endm
-function x264_deblock_v_luma_neon, export=1
+function x264_deblock_v_luma_neon
h264_loop_filter_start
vld1.64 {d0, d1}, [r0,:128], r1
bx lr
.endfunc
-function x264_deblock_h_luma_neon, export=1
+function x264_deblock_h_luma_neon
h264_loop_filter_start
sub r0, r0, #4
vqmovun.s16 d0, q11
.endm
-function x264_deblock_v_chroma_neon, export=1
+function x264_deblock_v_chroma_neon
h264_loop_filter_start
sub r0, r0, r1, lsl #1
bx lr
.endfunc
-function x264_deblock_h_chroma_neon, export=1
+function x264_deblock_h_chroma_neon
h264_loop_filter_start
sub r0, r0, #2
// The prefetch functions also use nothing above armv5te, but we don't care about pre-armv6
// void prefetch_ref( uint8_t *pix, int stride, int parity )
-function x264_prefetch_ref_arm, export=1
+function x264_prefetch_ref_arm
sub r2, r2, #1
add r0, r0, #64
and r2, r2, r1
// void prefetch_fenc( uint8_t *pix_y, int stride_y,
// uint8_t *pix_uv, int stride_uv, int mb_x )
-function x264_prefetch_fenc_arm, export=1
+function x264_prefetch_fenc_arm
ldr ip, [sp]
push {lr}
and lr, ip, #3
// void *x264_memcpy_aligned( void * dst, const void * src, size_t n )
-function x264_memcpy_aligned_neon, export=1
+function x264_memcpy_aligned_neon
orr r3, r0, r1, lsr #1
movrel ip, memcpy_table
and r3, r3, #0xc
.ltorg
// void x264_memzero_aligned( void *dst, int n )
-function x264_memzero_aligned_neon, export=1
+function x264_memzero_aligned_neon
vmov.i8 q0, #0
vmov.i8 q1, #0
memzero_loop:
// uint8_t *src1, int src1_stride,
// uint8_t *src2, int src2_stride, int weight );
.macro AVGH w h
-function x264_pixel_avg_\w\()x\h\()_neon, export=1
+function x264_pixel_avg_\w\()x\h\()_neon
ldr ip, [sp, #8]
push {r4-r6,lr}
cmp ip, #32
.endm
.macro AVG_WEIGHT ext
-function x264_pixel_avg_weight_w4_\ext\()_neon, export=1
+function x264_pixel_avg_weight_w4_\ext\()_neon
load_weights_\ext
1: // height loop
subs lr, lr, #2
pop {r4-r6,pc}
.endfunc
-function x264_pixel_avg_weight_w8_\ext\()_neon, export=1
+function x264_pixel_avg_weight_w8_\ext\()_neon
load_weights_\ext
1: // height loop
subs lr, lr, #4
pop {r4-r6,pc}
.endfunc
-function x264_pixel_avg_weight_w16_\ext\()_neon, export=1
+function x264_pixel_avg_weight_w16_\ext\()_neon
load_weights_\ext
1: // height loop
subs lr, lr, #2
AVG_WEIGHT add_sub
AVG_WEIGHT sub_add
-function x264_pixel_avg_w4_neon, export=1
+function x264_pixel_avg_w4_neon
subs lr, lr, #2
vld1.32 {d0[]}, [r2], r3
vld1.32 {d2[]}, [r4], r5
pop {r4-r6,pc}
.endfunc
-function x264_pixel_avg_w8_neon, export=1
+function x264_pixel_avg_w8_neon
subs lr, lr, #4
vld1.64 {d0}, [r2], r3
vld1.64 {d2}, [r4], r5
pop {r4-r6,pc}
.endfunc
-function x264_pixel_avg_w16_neon, export=1
+function x264_pixel_avg_w16_neon
subs lr, lr, #4
vld1.64 {d0-d1}, [r2], r3
vld1.64 {d2-d3}, [r4], r5
.endfunc
-function x264_pixel_avg2_w4_neon, export=1
+function x264_pixel_avg2_w4_neon
ldr ip, [sp, #4]
push {lr}
ldr lr, [sp, #4]
pop {pc}
.endfunc
-function x264_pixel_avg2_w8_neon, export=1
+function x264_pixel_avg2_w8_neon
ldr ip, [sp, #4]
push {lr}
ldr lr, [sp, #4]
pop {pc}
.endfunc
-function x264_pixel_avg2_w16_neon, export=1
+function x264_pixel_avg2_w16_neon
ldr ip, [sp, #4]
push {lr}
ldr lr, [sp, #4]
pop {pc}
.endfunc
-function x264_pixel_avg2_w20_neon, export=1
+function x264_pixel_avg2_w20_neon
ldr ip, [sp, #4]
push {lr}
sub r1, r1, #16
.endfunc
-// void mc_copy( uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int height )
-function x264_mc_copy_w4_neon, export=1
+// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
+function x264_mc_copy_w4_neon
ldr ip, [sp]
copy_w4_loop:
subs ip, ip, #4
bx lr
.endfunc
-function x264_mc_copy_w8_neon, export=1
+function x264_mc_copy_w8_neon
ldr ip, [sp]
copy_w8_loop:
subs ip, ip, #4
bx lr
.endfunc
-function x264_mc_copy_w16_neon, export=1
+function x264_mc_copy_w16_neon
ldr ip, [sp]
copy_w16_loop:
subs ip, ip, #4
bx lr
.endfunc
-function x264_mc_copy_w16_aligned_neon, export=1
+function x264_mc_copy_w16_aligned_neon
ldr ip, [sp]
copy_w16_aligned_loop:
subs ip, ip, #4
// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
// uint8_t *src, int i_src_stride,
// int dx, int dy, int i_width, int i_height );
-function x264_mc_chroma_neon, export=1
+function x264_mc_chroma_neon
push {r4-r6, lr}
ldrd r4, [sp, #16]
ldr r6, [sp, #24]
// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width)
-function x264_hpel_filter_v_neon, export=1
+function x264_hpel_filter_v_neon
ldr ip, [sp]
sub r1, r1, r3, lsl #1
push {lr}
.endfunc
// hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
-function x264_hpel_filter_c_neon, export=1
+function x264_hpel_filter_c_neon
sub r1, #16
vld1.64 {d0-d3}, [r1,:128]!
.endfunc
// hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
-function x264_hpel_filter_h_neon, export=1
+function x264_hpel_filter_h_neon
sub r1, #16
vmov.u8 d30, #5
vld1.64 {d0-d3}, [r1,:128]!
// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
// uint8_t *dstc, int src_stride, int dst_stride, int width,
// int height )
-function x264_frame_init_lowres_core_neon, export=1
+function x264_frame_init_lowres_core_neon
push {r4-r10,lr}
vpush {d8-d15}
ldrd r4, [sp, #96]
void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int );
void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n );
-void x264_memzero_aligned_neon( void *dst, size_t n );
+void x264_memzero_aligned_neon( void *dst, int n );
void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
.text
.macro SAD4_ARMV6 h
-function x264_pixel_sad_4x\h\()_armv6, export=1
+function x264_pixel_sad_4x\h\()_armv6
push {r4-r6,lr}
ldr r4, [r2], r3
ldr r5, [r0], r1
.endm
.macro SAD_FUNC w, h, name, align:vararg
-function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
+function x264_pixel_sad\name\()_\w\()x\h\()_neon
.if \w == 16
.set r, \h / 2 - 1
.else
.endm
.macro SAD_FUNC_DUAL w, h
-function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual, export=1
+function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual
SAD_DUAL_START_\w
.rept \h / 2 - \w / 8
SAD_DUAL_\w
.endm
.macro SAD_X_FUNC x, w, h
-function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
+function x264_pixel_sad_x\x\()_\w\()x\h\()_neon
push {r6-r7,lr}
.if \x == 3
ldrd r6, [sp, #12]
.endm
.macro SSD_FUNC w h
-function x264_pixel_ssd_\w\()x\h\()_neon, export=1
+function x264_pixel_ssd_\w\()x\h\()_neon
SSD_START_\w
.rept \h-2
SSD_\w
\vpadal \qsqr_sum, \qsqr_last
.endm
-function x264_pixel_var_8x8_neon, export=1
+function x264_pixel_var_8x8_neon
vld1.64 {d16}, [r0,:64], r1
vmull.u8 q1, d16, d16
vmovl.u8 q0, d16
b x264_var_end
.endfunc
-function x264_pixel_var_16x16_neon, export=1
+function x264_pixel_var_16x16_neon
vld1.64 {d16-d17}, [r0,:128], r1
vmull.u8 q12, d16, d16
vmovl.u8 q0, d16
vmlal.s16 \acc, \d1, \d1
.endm
-function x264_pixel_var2_8x8_neon, export=1
+function x264_pixel_var2_8x8_neon
DIFF_SUM q0, d0, d1
DIFF_SUM q8, d16, d17
SQR_ACC q1, d0, d1, vmull.s16
vsubl.u8 \q3, d6, d7
.endm
-function x264_pixel_satd_4x4_neon, export=1
+function x264_pixel_satd_4x4_neon
vld1.32 {d1[]}, [r2], r3
vld1.32 {d0[]}, [r0,:32], r1
vld1.32 {d3[]}, [r2], r3
bx lr
.endfunc
-function x264_pixel_satd_4x8_neon, export=1
+function x264_pixel_satd_4x8_neon
vld1.32 {d1[]}, [r2], r3
vld1.32 {d0[]}, [r0,:32], r1
vld1.32 {d3[]}, [r2], r3
b x264_satd_4x8_8x4_end_neon
.endfunc
-function x264_pixel_satd_8x4_neon, export=1
+function x264_pixel_satd_8x4_neon
vld1.64 {d1}, [r2], r3
vld1.64 {d0}, [r0,:64], r1
vsubl.u8 q0, d0, d1
bx lr
.endfunc
-function x264_pixel_satd_8x8_neon, export=1
+function x264_pixel_satd_8x8_neon
mov ip, lr
bl x264_satd_8x8_neon
bx lr
.endfunc
-function x264_pixel_satd_8x16_neon, export=1
+function x264_pixel_satd_8x16_neon
vpush {d8-d11}
mov ip, lr
bx lr
.endfunc
-function x264_pixel_satd_16x8_neon, export=1
+function x264_pixel_satd_16x8_neon
vpush {d8-d11}
mov ip, lr
bx lr
.endfunc
-function x264_pixel_satd_16x16_neon, export=1
+function x264_pixel_satd_16x16_neon
vpush {d8-d11}
mov ip, lr
.endfunc
-function x264_pixel_sa8d_8x8_neon, export=1
+function x264_pixel_sa8d_8x8_neon
mov ip, lr
bl x264_sa8d_8x8_neon
vadd.u16 q0, q8, q9
bx lr
.endfunc
-function x264_pixel_sa8d_16x16_neon, export=1
+function x264_pixel_sa8d_16x16_neon
vpush {d8-d11}
mov ip, lr
bl x264_sa8d_8x8_neon
.macro HADAMARD_AC w h
-function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
+function x264_pixel_hadamard_ac_\w\()x\h\()_neon
vpush {d8-d15}
movrel ip, mask_ac4
vmov.i8 q4, #0
vmull.u8 \ssb, \db, \db
.endm
-function x264_pixel_ssim_4x4x2_core_neon, export=1
+function x264_pixel_ssim_4x4x2_core_neon
ldr ip, [sp]
vld1.64 {d0}, [r0], r1
vld1.64 {d2}, [r2], r3
.endfunc
// FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2
-function x264_pixel_ssim_end4_neon, export=1
+function x264_pixel_ssim_end4_neon
vld1.32 {d16-d19}, [r0,:128]!
vld1.32 {d20-d23}, [r1,:128]!
vadd.s32 q0, q8, q10
.text
// because gcc doesn't believe in using the free shift in add
-function x264_predict_4x4_h_armv6, export=1
+function x264_predict_4x4_h_armv6
ldrb r1, [r0, #0*FDEC_STRIDE-1]
ldrb r2, [r0, #1*FDEC_STRIDE-1]
ldrb r3, [r0, #2*FDEC_STRIDE-1]
bx lr
.endfunc
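
The "free shift" is ARM's barrel shifter on the second ALU operand: the left-edge byte of each row can be splatted across a 32-bit word with two adds, which is what the armv6 version does by hand. A minimal sketch of the trick for one row:

    ldrb r1, [r0, #-1]          @ load the left neighbour, r1 = 0x000000bb
    add  r1, r1, r1, lsl #8     @ r1 = 0x0000bbbb
    add  r1, r1, r1, lsl #16    @ r1 = 0xbbbbbbbb
    str  r1, [r0]               @ one 4-pixel predicted row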
-function x264_predict_4x4_dc_armv6, export=1
+function x264_predict_4x4_dc_armv6
mov ip, #0
ldr r1, [r0, #-FDEC_STRIDE]
ldrb r2, [r0, #0*FDEC_STRIDE-1]
uadd8 \a2, \a2, \c2
.endm
-function x264_predict_4x4_ddr_armv6, export=1
+function x264_predict_4x4_ddr_armv6
ldr r1, [r0, # -FDEC_STRIDE]
ldrb r2, [r0, # -FDEC_STRIDE-1]
ldrb r3, [r0, #0*FDEC_STRIDE-1]
pop {r4-r6,pc}
.endfunc
-function x264_predict_4x4_ddl_neon, export=1
+function x264_predict_4x4_ddl_neon
sub r0, #FDEC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d0}, [r0], ip
bx lr
.endfunc
-function x264_predict_8x8_dc_neon, export=1
+function x264_predict_8x8_dc_neon
mov ip, #0
ldrd r2, [r1, #8]
push {r4-r5,lr}
.endfunc
-function x264_predict_8x8_h_neon, export=1
+function x264_predict_8x8_h_neon
add r1, r1, #7
mov ip, #FDEC_STRIDE
vld1.64 {d16}, [r1]
bx lr
.endfunc
-function x264_predict_8x8c_h_neon, export=1
+function x264_predict_8x8c_h_neon
sub r1, r0, #1
mov ip, #FDEC_STRIDE
.rept 4
bx lr
.endfunc
-function x264_predict_8x8c_v_neon, export=1
+function x264_predict_8x8c_v_neon
sub r0, r0, #FDEC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d0}, [r0,:64], ip
.endfunc
-function x264_predict_16x16_dc_neon, export=1
+function x264_predict_16x16_dc_neon
sub r3, r0, #FDEC_STRIDE
sub r0, r0, #1
vld1.64 {d0-d1}, [r3,:128]
bx lr
.endfunc
-function x264_predict_16x16_h_neon, export=1
+function x264_predict_16x16_h_neon
sub r1, r0, #1
mov ip, #FDEC_STRIDE
.rept 8
bx lr
.endfunc
-function x264_predict_16x16_v_neon, export=1
+function x264_predict_16x16_v_neon
sub r0, r0, #FDEC_STRIDE
mov ip, #FDEC_STRIDE
vld1.64 {d0-d1}, [r0,:128], ip
.endm
// quant_2x2_dc( int16_t dct[4], int mf, int bias )
-function x264_quant_2x2_dc_neon, export=1
+function x264_quant_2x2_dc_neon
vld1.64 {d0}, [r0,:64]
vabs.s16 d3, d0
vdup.16 d2, r2
.endfunc
// quant_4x4_dc( int16_t dct[16], int mf, int bias )
-function x264_quant_4x4_dc_neon, export=1
+function x264_quant_4x4_dc_neon
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
.endfunc
// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
-function x264_quant_4x4_neon, export=1
+function x264_quant_4x4_neon
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
.endfunc
// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
-function x264_quant_8x8_neon, export=1
+function x264_quant_8x8_neon
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
vabs.s16 q9, q15
// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
.macro DEQUANT size bits
-function x264_dequant_\size\()_neon, export=1
+function x264_dequant_\size\()_neon
DEQUANT_START \bits+2, \bits
.ifc \size, 8x8
mov r2, #4
DEQUANT 8x8, 6
// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
-function x264_dequant_4x4_dc_neon, export=1
+function x264_dequant_4x4_dc_neon
DEQUANT_START 6, 6, yes
blt dequant_4x4_dc_rshift
// int coeff_last( int16_t *l )
-function x264_coeff_last4_arm, export=1
+function x264_coeff_last4_arm
ldrd r2, [r0]
subs r0, r3, #0
movne r0, #2
.endfunc
.macro COEFF_LAST_1x size
-function x264_coeff_last\size\()_neon, export=1
+function x264_coeff_last\size\()_neon
.if \size == 15
sub r0, r0, #2
vld1.64 {d0-d3}, [r0]
COEFF_LAST_1x 15
COEFF_LAST_1x 16
-function x264_coeff_last64_neon, export=1
+function x264_coeff_last64_neon
vld1.64 {d16-d19}, [r0,:128]!
vqmovn.u16 d16, q8
vqmovn.u16 d17, q9