From: Janne Grunau
Date: Sun, 20 Jul 2014 11:32:10 +0000 (+0200)
Subject: arm: check if the assembler supports the '.func' directive
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=d72760401cb0602b8bf86037988e66cdc810681c;p=x264

arm: check if the assembler supports the '.func' directive

The integrated assembler in LLVM trunk (to be released as 3.5) is
otherwise capable enough to assemble the ARM asm correctly. (A sketch
of the resulting macro expansion and of the configure probe follows
the diff.)
---
diff --git a/common/arm/asm.S b/common/arm/asm.S
index 66568589..3fb11b86 100644
--- a/common/arm/asm.S
+++ b/common/arm/asm.S
@@ -50,6 +50,12 @@
 # define ELF @
 #endif

+#if HAVE_AS_FUNC
+# define FUNC
+#else
+# define FUNC @
+#endif
+
 .macro require8, val=1
 ELF .eabi_attribute 24, \val
 .endm
@@ -59,17 +65,22 @@ ELF .eabi_attribute 25, \val
 .endm

 .macro function name, export=1
+    .macro endfunc
+ELF .size \name, . - \name
+FUNC .endfunc
+    .purgem endfunc
+    .endm
     .align 2
 .if \export == 1
     .global EXTERN_ASM\name
 ELF .hidden EXTERN_ASM\name
 ELF .type EXTERN_ASM\name, %function
-    .func EXTERN_ASM\name
+FUNC .func EXTERN_ASM\name
 EXTERN_ASM\name:
 .else
 ELF .hidden \name
 ELF .type \name, %function
-    .func \name
+FUNC .func \name
 \name:
 .endif
 .endm
diff --git a/common/arm/cpu-a.S b/common/arm/cpu-a.S
index 7fc273a1..9285219d 100644
--- a/common/arm/cpu-a.S
+++ b/common/arm/cpu-a.S
@@ -32,7 +32,7 @@ function x264_cpu_neon_test
     vadd.i16 q0, q0, q0
     bx lr
-.endfunc
+endfunc

 // return: 0 on success
 // 1 if counters were already enabled
@@ -48,14 +48,14 @@ function x264_cpu_enable_armv7_counter, export=0
     mov r2, #1 << 31 // enable cycle counter
     mcr p15, 0, r2, c9, c12, 1 // write CNTENS
     bx lr
-.endfunc
+endfunc

 function x264_cpu_disable_armv7_counter, export=0
     mrc p15, 0, r0, c9, c12, 0 // read PMNC
     bic r0, r0, #1 // disable counters
     mcr p15, 0, r0, c9, c12, 0 // write PMNC
     bx lr
-.endfunc
+endfunc

 .macro READ_TIME r
@@ -105,4 +105,4 @@ average_loop:
     cmp r0, #10
     movgt r0, #0
     pop {r4-r6,pc}
-.endfunc
+endfunc
diff --git a/common/arm/dct-a.S b/common/arm/dct-a.S
index 9e7d75f1..f8d1ccfd 100644
--- a/common/arm/dct-a.S
+++ b/common/arm/dct-a.S
@@ -80,7 +80,7 @@ function x264_dct4x4dc_neon
     vrhadd.s16 d3, d6, d7
     vst1.64 {d0-d3}, [r0,:128]
     bx lr
-.endfunc
+endfunc

 function x264_idct4x4dc_neon
     vld1.64 {d0-d3}, [r0,:128]
@@ -92,7 +92,7 @@ function x264_idct4x4dc_neon
     HADAMARD 2, sumsub, d3, d2, d6, d7
     vst1.64 {d0-d3}, [r0,:128]
     bx lr
-.endfunc
+endfunc

 .macro DCT_1D d0 d1 d2 d3 d4 d5 d6 d7
@@ -127,7 +127,7 @@ function x264_sub4x4_dct_neon
     DCT_1D d4, d5, d6, d7, d0, d1, d2, d3
     vst1.64 {d4-d7}, [r0,:128]
     bx lr
-.endfunc
+endfunc

 function x264_sub8x4_dct_neon, export=0
     vld1.64 {d0}, [r1,:64], r3
@@ -163,7 +163,7 @@ function x264_sub8x4_dct_neon, export=0
     vst1.64 {d4-d5}, [r0,:128]!
     vst1.64 {d6-d7}, [r0,:128]!
     bx lr
-.endfunc
+endfunc

 function x264_sub8x8_dct_neon
     push {lr}
@@ -172,7 +172,7 @@ function x264_sub8x8_dct_neon
     bl x264_sub8x4_dct_neon
     pop {lr}
     b x264_sub8x4_dct_neon
-.endfunc
+endfunc

 function x264_sub16x16_dct_neon
     push {lr}
@@ -193,7 +193,7 @@ function x264_sub16x16_dct_neon
     bl x264_sub8x4_dct_neon
     pop {lr}
     b x264_sub8x4_dct_neon
-.endfunc
+endfunc

 .macro DCT8_1D type
@@ -277,7 +277,7 @@ function x264_sub8x8_dct8_neon
     vst1.64 {d24-d27}, [r0,:128]!
     vst1.64 {d28-d31}, [r0,:128]!
     bx lr
-.endfunc
+endfunc

 function x264_sub16x16_dct8_neon
     push {lr}
@@ -292,7 +292,7 @@ function x264_sub16x16_dct8_neon
     sub r1, r1, #FENC_STRIDE*8 - 8
     sub r2, r2, #FDEC_STRIDE*8 - 8
     b X(x264_sub8x8_dct8_neon)
-.endfunc
+endfunc

 // First part of IDCT (minus final SUMSUB_BA)
@@ -334,7 +334,7 @@ function x264_add4x4_idct_neon
     vst1.32 {d2[1]}, [r0,:32], r2
     vst1.32 {d2[0]}, [r0,:32], r2
     bx lr
-.endfunc
+endfunc

 function x264_add8x4_idct_neon, export=0
     vld1.64 {d0-d3}, [r1,:128]!
@@ -374,7 +374,7 @@ function x264_add8x4_idct_neon, export=0
     vst1.32 {d2}, [r0,:64], r2
     vst1.32 {d3}, [r0,:64], r2
     bx lr
-.endfunc
+endfunc

 function x264_add8x8_idct_neon
     mov r2, #FDEC_STRIDE
@@ -382,7 +382,7 @@ function x264_add8x8_idct_neon
     bl x264_add8x4_idct_neon
     mov lr, ip
     b x264_add8x4_idct_neon
-.endfunc
+endfunc

 function x264_add16x16_idct_neon
     mov r2, #FDEC_STRIDE
@@ -399,7 +399,7 @@ function x264_add16x16_idct_neon
     bl x264_add8x4_idct_neon
     mov lr, ip
     b x264_add8x4_idct_neon
-.endfunc
+endfunc

 .macro IDCT8_1D type
@@ -496,7 +496,7 @@ function x264_add8x8_idct8_neon
     vst1.64 {d6}, [r0,:64], r2
     vst1.64 {d7}, [r0,:64], r2
     bx lr
-.endfunc
+endfunc

 function x264_add16x16_idct8_neon
     mov ip, lr
@@ -508,7 +508,7 @@ function x264_add16x16_idct8_neon
     sub r0, r0, #8*FDEC_STRIDE-8
     mov lr, ip
     b X(x264_add8x8_idct8_neon)
-.endfunc
+endfunc

 function x264_add8x8_idct_dc_neon
@@ -560,7 +560,7 @@ function x264_add8x8_idct_dc_neon
     vst1.64 {d6}, [r0,:64], r2
     vst1.64 {d7}, [r0,:64], r2
     bx lr
-.endfunc
+endfunc

 .macro ADD16x4_IDCT_DC dc
     vld1.64 {d16-d17}, [r0,:128], r3
@@ -608,7 +608,7 @@ function x264_add16x16_idct_dc_neon
     ADD16x4_IDCT_DC d2
     ADD16x4_IDCT_DC d3
     bx lr
-.endfunc
+endfunc

 function x264_sub8x8_dct_dc_neon
     mov r3, #FENC_STRIDE
@@ -656,7 +656,7 @@ function x264_sub8x8_dct_dc_neon
     vpadd.s16 d0, d0, d1
     vst1.64 {d0}, [r0,:64]
     bx lr
-.endfunc
+endfunc

 function x264_zigzag_scan_4x4_frame_neon
@@ -669,4 +669,4 @@ function x264_zigzag_scan_4x4_frame_neon
     vtbl.8 d7, {d2-d3}, d19
     vst1.64 {d4-d7}, [r0,:128]
     bx lr
-.endfunc
+endfunc
diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index 2d0ef9ff..59977b4b 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -140,7 +140,7 @@ function x264_deblock_v_luma_neon
     align_pop_regs
     bx lr
-.endfunc
+endfunc

 function x264_deblock_h_luma_neon
     h264_loop_filter_start
@@ -192,7 +192,7 @@ function x264_deblock_h_luma_neon
     align_pop_regs
     bx lr
-.endfunc
+endfunc

 .macro h264_loop_filter_chroma
     vdup.8 q11, r2 // alpha
@@ -253,7 +253,7 @@ function x264_deblock_v_chroma_neon
     vst2.8 {d0, d1}, [r0,:128], r1
     bx lr
-.endfunc
+endfunc

 function x264_deblock_h_chroma_neon
     h264_loop_filter_start
@@ -301,7 +301,7 @@ function x264_deblock_h_chroma_neon
     vst1.8 {d3}, [r0], r1
     bx lr
-.endfunc
+endfunc

 function x264_deblock_strength_neon
     ldr ip, [sp]
@@ -407,4 +407,4 @@ lists:
     vst1.8 {q8}, [r3,:128] @ bs[0]
     bx lr
-.endfunc
+endfunc
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 3a16d0db..cd57920c 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -49,7 +49,7 @@ function x264_prefetch_ref_arm
     pld [r3, r1, lsl #1]
     pld [r3, r2]
     bx lr
-.endfunc
+endfunc

 // void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y,
 //                     uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
@@ -75,7 +75,7 @@ function x264_prefetch_fenc_arm
     pld [ip]
     pld [ip, r3]
     pop {pc}
-.endfunc
+endfunc

 // void *x264_memcpy_aligned( void *dst, const void *src, size_t n )
@@ -84,7 +84,7 @@ function x264_memcpy_aligned_neon
     movrel ip, memcpy_table
     and r3, r3, #0xc
     ldr pc, [ip, r3]
-.endfunc
+endfunc

 .macro MEMCPY_ALIGNED srcalign dstalign
 function memcpy_aligned_\dstalign\()_\srcalign\()_neon, export=0
@@ -126,7 +126,7 @@ function memcpy_aligned_\dstalign\()_\srcalign\()_neon, export=0
     vst1.64 {d0}, [r3,:64]!
 .endif
     bx lr
-.endfunc
+endfunc
 .endm

 MEMCPY_ALIGNED 16, 16
@@ -155,7 +155,7 @@ memzero_loop:
 .endr
     bgt memzero_loop
     bx lr
-.endfunc
+endfunc

 // void pixel_avg( uint8_t *dst, intptr_t dst_stride,
@@ -174,7 +174,7 @@ function x264_pixel_avg_\w\()x\h\()_neon
     cmp ip, #0
     bge x264_pixel_avg_weight_w\w\()_add_add_neon
     b x264_pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
-.endfunc
+endfunc
 .endm

 AVGH 4, 2
@@ -252,7 +252,7 @@ function x264_pixel_avg_weight_w4_\ext\()_neon, export=0
     vst1.32 {d1[0]}, [r0,:32], r1
     bgt 1b
     pop {r4-r6,pc}
-.endfunc
+endfunc

 function x264_pixel_avg_weight_w8_\ext\()_neon, export=0
     load_weights_\ext
@@ -276,7 +276,7 @@ function x264_pixel_avg_weight_w8_\ext\()_neon, export=0
     vst1.64 {d3}, [r0,:64], r1
     bgt 1b
     pop {r4-r6,pc}
-.endfunc
+endfunc

 function x264_pixel_avg_weight_w16_\ext\()_neon, export=0
     load_weights_\ext
@@ -296,7 +296,7 @@ function x264_pixel_avg_weight_w16_\ext\()_neon, export=0
     vst1.64 {d2-d3}, [r0,:128], r1
     bgt 1b
     pop {r4-r6,pc}
-.endfunc
+endfunc
 .endm

 AVG_WEIGHT add_add
@@ -315,7 +315,7 @@ function x264_pixel_avg_w4_neon, export=0
     vst1.32 {d1[0]}, [r0,:32], r1
     bgt x264_pixel_avg_w4_neon
     pop {r4-r6,pc}
-.endfunc
+endfunc

 function x264_pixel_avg_w8_neon, export=0
     subs lr, lr, #4
@@ -337,7 +337,7 @@ function x264_pixel_avg_w8_neon, export=0
     vst1.64 {d3}, [r0,:64], r1
     bgt x264_pixel_avg_w8_neon
     pop {r4-r6,pc}
-.endfunc
+endfunc

 function x264_pixel_avg_w16_neon, export=0
     subs lr, lr, #4
@@ -359,7 +359,7 @@ function x264_pixel_avg_w16_neon, export=0
     vst1.64 {d6-d7}, [r0,:128], r1
     bgt x264_pixel_avg_w16_neon
     pop {r4-r6,pc}
-.endfunc
+endfunc

 function x264_pixel_avg2_w4_neon
@@ -378,7 +378,7 @@ avg2_w4_loop:
     vst1.32 {d1[0]}, [r0,:32], r1
     bgt avg2_w4_loop
     pop {pc}
-.endfunc
+endfunc

 function x264_pixel_avg2_w8_neon
     ldr ip, [sp, #4]
@@ -396,7 +396,7 @@ avg2_w8_loop:
     vst1.64 {d1}, [r0,:64], r1
     bgt avg2_w8_loop
     pop {pc}
-.endfunc
+endfunc

 function x264_pixel_avg2_w16_neon
     ldr ip, [sp, #4]
@@ -414,7 +414,7 @@ avg2_w16_loop:
     vst1.64 {d4-d5}, [r0,:128], r1
     bgt avg2_w16_loop
     pop {pc}
-.endfunc
+endfunc

 function x264_pixel_avg2_w20_neon
     ldr ip, [sp, #4]
@@ -437,7 +437,7 @@ avg2_w20_loop:
     vst1.32 {d6[0]}, [r0,:32], r1
     bgt avg2_w20_loop
     pop {pc}
-.endfunc
+endfunc

 .macro weight_prologue type
@@ -498,7 +498,7 @@ weight20_loop:
     vst1.32 {d20[1]}, [r0,:32], r1
     bgt weight20_loop
     pop {r4-r5,pc}
-.endfunc
+endfunc

 function x264_mc_weight_w16_neon
     weight_prologue full
@@ -530,7 +530,7 @@ weight16_loop:
     vst1.8 {d18-d19}, [r0,:128], r1
     bgt weight16_loop
     pop {r4-r5,pc}
-.endfunc
+endfunc

 function x264_mc_weight_w8_neon
     weight_prologue full
@@ -552,7 +552,7 @@ weight8_loop:
     vst1.8 {d18}, [r0,:64], r1
     bgt weight8_loop
     pop {r4-r5,pc}
-.endfunc
+endfunc

 function x264_mc_weight_w4_neon
     weight_prologue full
@@ -571,7 +571,7 @@ weight4_loop:
     vst1.32 {d16[1]}, [r0,:32], r1
     bgt weight4_loop
     pop {r4-r5,pc}
-.endfunc
+endfunc

 function x264_mc_weight_w20_nodenom_neon
     weight_prologue nodenom
@@ -608,7 +608,7 @@ weight20_nodenom_loop:
     vst1.32 {d20[1]}, [r0,:32], r1
     bgt weight20_nodenom_loop
     pop {r4-r5,pc}
-.endfunc
+endfunc

 function x264_mc_weight_w16_nodenom_neon
     weight_prologue nodenom
@@ -636,7 +636,7 @@ weight16_nodenom_loop:
     vst1.8 {d18-d19}, [r0,:128], r1
     bgt weight16_nodenom_loop
     pop {r4-r5,pc}
-.endfunc
+endfunc

 function x264_mc_weight_w8_nodenom_neon
     weight_prologue nodenom
@@ -656,7 +656,7 @@ weight8_nodenom_loop:
     vst1.8 {d17}, [r0,:64], r1
     bgt weight8_nodenom_loop
     pop {r4-r5,pc}
-.endfunc
+endfunc

 function x264_mc_weight_w4_nodenom_neon
     weight_prologue nodenom
@@ -674,7 +674,7 @@ weight4_nodenom_loop:
     vst1.32 {d16[1]}, [r0,:32], r1
     bgt weight4_nodenom_loop
     pop {r4-r5,pc}
-.endfunc
+endfunc

 .macro weight_simple_prologue
     push {lr}
@@ -698,7 +698,7 @@ weight20_\name\()_loop:
     vst1.8 {d19-d21}, [r0,:64], r1
     bgt weight20_\name\()_loop
     pop {pc}
-.endfunc
+endfunc

 function x264_mc_weight_w16_\name\()_neon
     weight_simple_prologue
@@ -712,7 +712,7 @@ weight16_\name\()_loop:
     vst1.8 {d18-d19}, [r0,:128], r1
     bgt weight16_\name\()_loop
     pop {pc}
-.endfunc
+endfunc

 function x264_mc_weight_w8_\name\()_neon
     weight_simple_prologue
@@ -725,7 +725,7 @@ weight8_\name\()_loop:
     vst1.8 {d17}, [r0,:64], r1
     bgt weight8_\name\()_loop
     pop {pc}
-.endfunc
+endfunc

 function x264_mc_weight_w4_\name\()_neon
     weight_simple_prologue
@@ -738,7 +738,7 @@ weight4_\name\()_loop:
     vst1.32 {d17[0]}, [r0,:32], r1
     bgt weight4_\name\()_loop
     pop {pc}
-.endfunc
+endfunc
 .endm

 weight_simple offsetadd, vqadd.u8
@@ -760,7 +760,7 @@ copy_w4_loop:
     vst1.32 {d3[0]}, [r0,:32], r1
     bgt copy_w4_loop
     bx lr
-.endfunc
+endfunc

 function x264_mc_copy_w8_neon
     ldr ip, [sp]
@@ -776,7 +776,7 @@ copy_w8_loop:
     vst1.32 {d3}, [r0,:64], r1
     bgt copy_w8_loop
     bx lr
-.endfunc
+endfunc

 function x264_mc_copy_w16_neon
     ldr ip, [sp]
@@ -792,7 +792,7 @@ copy_w16_loop:
     vst1.32 {d6-d7}, [r0,:128], r1
     bgt copy_w16_loop
     bx lr
-.endfunc
+endfunc

 function x264_mc_copy_w16_aligned_neon
     ldr ip, [sp]
@@ -808,7 +808,7 @@ copy_w16_aligned_loop:
     vst1.32 {d6-d7}, [r0,:128], r1
     bgt copy_w16_aligned_loop
     bx lr
-.endfunc
+endfunc

 // void x264_mc_chroma_neon( uint8_t *dst, intptr_t i_dst_stride,
@@ -1158,7 +1158,7 @@ mc_chroma_w8:
     vpop {d8-d11}
     pop {r4-r8, pc}
-.endfunc
+endfunc

 // hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, int width )
@@ -1199,7 +1199,7 @@ filter_v_loop:
     vst1.64 {d0-d1}, [r0,:128]!
     bgt filter_v_loop
     pop {pc}
-.endfunc
+endfunc

 // hpel_filter_c( uint8_t *dst, int16_t *buf, int width );
 function x264_hpel_filter_c_neon
@@ -1284,7 +1284,7 @@ filter_c_loop:
     vst1.64 {d30-d31}, [r0,:128]!
     bgt filter_c_loop
     bx lr
-.endfunc
+endfunc

 // hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
 function x264_hpel_filter_h_neon
@@ -1371,7 +1371,7 @@ filter_h_loop:
     vst1.64 {d6-d7}, [r0,:128]!
     bgt filter_h_loop
     bx lr
-.endfunc
+endfunc

 // frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
@@ -1463,7 +1463,7 @@ lowres_xloop_end:
     vpop {d8-d15}
     pop {r4-r10,pc}
-.endfunc
+endfunc

 function x264_load_deinterleave_chroma_fdec_neon
     mov ip, #FDEC_STRIDE/2
@@ -1476,7 +1476,7 @@ function x264_load_deinterleave_chroma_fdec_neon
     bgt 1b
     bx lr
-.endfunc
+endfunc

 function x264_load_deinterleave_chroma_fenc_neon
     mov ip, #FENC_STRIDE/2
@@ -1489,7 +1489,7 @@ function x264_load_deinterleave_chroma_fenc_neon
     bgt 1b
     bx lr
-.endfunc
+endfunc

 function x264_plane_copy_deinterleave_neon
     push {r4-r7, lr}
@@ -1515,7 +1515,7 @@ block:
     bgt block
     pop {r4-r7, pc}
-.endfunc
+endfunc

 function x264_plane_copy_deinterleave_rgb_neon
     push {r4-r8, r10, r11, lr}
@@ -1567,7 +1567,7 @@ block4:
     bgt block4
     pop {r4-r8, r10, r11, pc}
-.endfunc
+endfunc

 function x264_plane_copy_interleave_neon
     push {r4-r7, lr}
@@ -1594,7 +1594,7 @@ blocki:
     bgt blocki
     pop {r4-r7, pc}
-.endfunc
+endfunc

 function x264_store_interleave_chroma_neon
     push {lr}
@@ -1608,4 +1608,4 @@ function x264_store_interleave_chroma_neon
     bgt 1b
     pop {pc}
-.endfunc
+endfunc
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index e288bcf6..80b8b700 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -61,7 +61,7 @@ function x264_pixel_sad_4x\h\()_armv6
 .endr
     usada8 r0, r6, lr, ip
     pop {r4-r6,pc}
-.endfunc
+endfunc
 .endm

 SAD4_ARMV6 4
@@ -138,7 +138,7 @@ function x264_pixel_sad\name\()_\w\()x\h\()_neon
     vpaddl.u16 d0, d0
     vmov.u32 r0, d0[0]
     bx lr
-.endfunc
+endfunc
 .endm

 SAD_FUNC 4, 4
@@ -223,7 +223,7 @@ function x264_pixel_sad_aligned_\w\()x\h\()_neon_dual
     vpaddl.u16 d0, d0
     vmov.u32 r0, d0[0]
     bx lr
-.endfunc
+endfunc
 .endm

 SAD_FUNC_DUAL 8, 4
@@ -369,7 +369,7 @@ function x264_pixel_sad_x\x\()_\w\()x\h\()_neon
     vst1.32 {d0-d1}, [r7]
 .endif
     pop {r6-r7,pc}
-.endfunc
+endfunc
 .endm

 SAD_X_FUNC 3, 4, 4
@@ -478,7 +478,7 @@ function x264_pixel_ssd_\w\()x\h\()_neon
     vpadd.s32 d0, d0, d0
     vmov.32 r0, d0[0]
     bx lr
-.endfunc
+endfunc
 .endm

 SSD_FUNC 4, 4
@@ -518,7 +518,7 @@ function x264_pixel_var_8x8_neon
     vld1.64 {d26}, [r0,:64], r1
     VAR_SQR_SUM q2, q10, q15, d26
     b x264_var_end
-.endfunc
+endfunc

 function x264_pixel_var_8x16_neon
     vld1.64 {d16}, [r0,:64], r1
@@ -550,7 +550,7 @@ function x264_pixel_var_8x16_neon
 2:
     VAR_SQR_SUM q2, q13, q15, d22
     b x264_var_end
-.endfunc
+endfunc

 function x264_pixel_var_16x16_neon
     vld1.64 {d16-d17}, [r0,:128], r1
@@ -574,7 +574,7 @@ var16_loop:
     VAR_SQR_SUM q1, q12, q14, d18
     VAR_SQR_SUM q2, q13, q15, d19
     bgt var16_loop
-.endfunc
+endfunc

 function x264_var_end, export=0
     vpaddl.u16 q8, q14
@@ -589,7 +589,7 @@ function x264_var_end, export=0
     vmov r0, r1, d0
     bx lr
-.endfunc
+endfunc

 .macro DIFF_SUM diff da db lastdiff
     vld1.64 {\da}, [r0,:64], r1
@@ -634,7 +634,7 @@ function x264_pixel_var2_8x8_neon
     mul r0, r0, r0
     sub r0, r1, r0, lsr #6
     bx lr
-.endfunc
+endfunc

 function x264_pixel_var2_8x16_neon
     vld1.64 {d16}, [r0,:64], r1
@@ -678,7 +678,7 @@ function x264_pixel_var2_8x16_neon
     mul r0, r0, r0
     sub r0, r1, r0, lsr #7
     bx lr
-.endfunc
+endfunc

 .macro LOAD_DIFF_8x4 q0 q1 q2 q3
     vld1.32 {d1}, [r2], r3
@@ -715,7 +715,7 @@ function x264_pixel_satd_4x4_neon
     HORIZ_ADD d0, d0, d1
     vmov.32 r0, d0[0]
     bx lr
-.endfunc
+endfunc

 function x264_pixel_satd_4x8_neon
     vld1.32 {d1[]}, [r2], r3
@@ -742,7 +742,7 @@ function x264_pixel_satd_4x8_neon
     vsubl.u8 q3, d6, d7
     SUMSUB_AB q10, q11, q2, q3
     b x264_satd_4x8_8x4_end_neon
-.endfunc
+endfunc

 function x264_pixel_satd_8x4_neon
     vld1.64 {d1}, [r2], r3
@@ -759,7 +759,7 @@ function x264_pixel_satd_8x4_neon
     vld1.64 {d6}, [r0,:64], r1
     vsubl.u8 q3, d6, d7
     SUMSUB_AB q10, q11, q2, q3
-.endfunc
+endfunc

 function x264_satd_4x8_8x4_end_neon, export=0
     vadd.s16 q0, q8, q10
@@ -786,7 +786,7 @@ function x264_satd_4x8_8x4_end_neon, export=0
     HORIZ_ADD d0, d0, d1
     vmov.32 r0, d0[0]
     bx lr
-.endfunc
+endfunc

 function x264_pixel_satd_8x8_neon
     mov ip, lr
@@ -800,7 +800,7 @@ function x264_pixel_satd_8x8_neon
     mov lr, ip
     vmov.32 r0, d0[0]
     bx lr
-.endfunc
+endfunc

 function x264_pixel_satd_8x16_neon
     vpush {d8-d11}
@@ -822,7 +822,7 @@ function x264_pixel_satd_8x16_neon
     mov lr, ip
     vmov.32 r0, d0[0]
     bx lr
-.endfunc
+endfunc

 function x264_satd_8x8_neon, export=0
     LOAD_DIFF_8x4 q8, q9, q10, q11
@@ -842,7 +842,7 @@ function x264_satd_8x8_neon, export=0
     SUMSUB_AB q9, q11, q1, q3
     vld1.64 {d0}, [r0,:64], r1
     vsubl.u8 q15, d0, d1
-.endfunc
+endfunc

 // one vertical hadamard pass and two horizontal
 function x264_satd_8x4v_8x8h_neon, export=0
@@ -871,7 +871,7 @@ function x264_satd_8x4v_8x8h_neon, export=0
     vmax.s16 q14, q8, q10
     vmax.s16 q15, q9, q11
     bx lr
-.endfunc
+endfunc

 function x264_pixel_satd_16x8_neon
     vpush {d8-d11}
@@ -893,7 +893,7 @@ function x264_pixel_satd_16x8_neon
     mov lr, ip
     vmov.32 r0, d0[0]
     bx lr
-.endfunc
+endfunc

 function x264_pixel_satd_16x16_neon
     vpush {d8-d11}
@@ -927,7 +927,7 @@ function x264_pixel_satd_16x16_neon
     mov lr, ip
     vmov.32 r0, d0[0]
     bx lr
-.endfunc
+endfunc

 function x264_satd_16x4_neon, export=0
     vld1.64 {d2-d3}, [r2], r3
@@ -951,7 +951,7 @@ function x264_satd_16x4_neon, export=0
     SUMSUB_AB q2, q3, q10, q11
     SUMSUB_ABCD q8, q10, q9, q11, q0, q2, q1, q3
     b x264_satd_8x4v_8x8h_neon
-.endfunc
+endfunc

 function x264_pixel_sa8d_8x8_neon
@@ -964,7 +964,7 @@ function x264_pixel_sa8d_8x8_neon
     add r0, r0, #1
     lsr r0, r0, #1
     bx lr
-.endfunc
+endfunc

 function x264_pixel_sa8d_16x16_neon
     vpush {d8-d11}
@@ -996,7 +996,7 @@ function x264_pixel_sa8d_16x16_neon
     add r0, r0, #1
     lsr r0, r0, #1
     bx lr
-.endfunc
+endfunc

 .macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
     SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
@@ -1059,7 +1059,7 @@ function x264_sa8d_8x8_neon, export=0
     vadd.i16 q8, q8, q9
     vadd.i16 q9, q10, q11
     bx lr
-.endfunc
+endfunc

 .macro HADAMARD_AC w h
@@ -1095,7 +1095,7 @@ function x264_pixel_hadamard_ac_\w\()x\h\()_neon
     lsr r0, r0, #1
     lsr r1, r1, #2
     bx lr
-.endfunc
+endfunc
 .endm

 HADAMARD_AC 8, 8
@@ -1190,7 +1190,7 @@ function x264_hadamard_ac_8x8_neon, export=0
     vadd.s16 q2, q2, q14
     vpadal.u16 q5, q2
     bx lr
-.endfunc
+endfunc

 .macro SSIM_ITER n ssa s12 ssb lastssa lasts12 lastssb da db dnext
@@ -1244,7 +1244,7 @@ function x264_pixel_ssim_4x4x2_core_neon
     vst4.32 {d0-d3}, [ip]
     bx lr
-.endfunc
+endfunc

 // FIXME: see about doing 16x16 -> 32 bit multiplies for s1/s2
 function x264_pixel_ssim_end4_neon
@@ -1315,4 +1315,4 @@ ssim_skip:
     vpadd.f32 d0, d0, d0
     vmov.32 r0, d0[0]
     bx lr
-.endfunc
+endfunc
diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
index 8af861bd..593d4030 100644
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -75,7 +75,7 @@ function x264_predict_4x4_h_armv6
     add ip, ip, ip, lsl #16
     str ip, [r0, #3*FDEC_STRIDE]
     bx lr
-.endfunc
+endfunc

 function x264_predict_4x4_v_armv6
     ldr r1, [r0, #0 - 1 * FDEC_STRIDE]
@@ -84,7 +84,7 @@ function x264_predict_4x4_v_armv6
     str r1, [r0, #0 + 2 * FDEC_STRIDE]
     str r1, [r0, #0 + 3 * FDEC_STRIDE]
     bx lr
-.endfunc
+endfunc

 function x264_predict_4x4_dc_armv6
     mov ip, #0
@@ -107,7 +107,7 @@ function x264_predict_4x4_dc_armv6
     str r1, [r0, #2*FDEC_STRIDE]
     str r1, [r0, #3*FDEC_STRIDE]
     bx lr
-.endfunc
+endfunc

 function x264_predict_4x4_dc_top_neon
     mov r12, #FDEC_STRIDE
@@ -122,7 +122,7 @@ function x264_predict_4x4_dc_top_neon
     vst1.32 d1[0], [r0,:32], r12
     vst1.32 d1[0], [r0,:32], r12
     bx lr
-.endfunc
+endfunc

 // return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
 .macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
@@ -165,7 +165,7 @@ function x264_predict_4x4_ddr_armv6
     add r5, r5, r4, lsr #8
     str r5, [r0, #3*FDEC_STRIDE]
     pop {r4-r6,pc}
-.endfunc
+endfunc

 function x264_predict_4x4_ddl_neon
     sub r0, #FDEC_STRIDE
@@ -184,7 +184,7 @@ function x264_predict_4x4_ddl_neon
     vst1.32 {d2[0]}, [r0,:32], ip
     vst1.32 {d3[0]}, [r0,:32], ip
     bx lr
-.endfunc
+endfunc

 function x264_predict_8x8_dc_neon
     mov ip, #0
@@ -208,7 +208,7 @@ function x264_predict_8x8_dc_neon
     vst1.64 {d0}, [r0,:64], ip
 .endr
     pop {r4-r5,pc}
-.endfunc
+endfunc

 function x264_predict_8x8_h_neon
     add r1, r1, #7
@@ -231,7 +231,7 @@ function x264_predict_8x8_h_neon
     vst1.64 {d6}, [r0,:64], ip
     vst1.64 {d7}, [r0,:64], ip
     bx lr
-.endfunc
+endfunc

 function x264_predict_8x8_v_neon
     add r1, r1, #16
@@ -241,7 +241,7 @@ function x264_predict_8x8_v_neon
     vst1.8 {d0}, [r0,:64], r12
 .endr
     bx lr
-.endfunc
+endfunc

 function x264_predict_8x8_ddl_neon
     add r1, #16
@@ -269,7 +269,7 @@ function x264_predict_8x8_ddl_neon
     vst1.8 d2, [r0,:64], r12
     vst1.8 d1, [r0,:64], r12
     bx lr
-.endfunc
+endfunc

 function x264_predict_8x8_ddr_neon
     vld1.8 {d0-d3}, [r1,:128]
@@ -299,7 +299,7 @@ function x264_predict_8x8_ddr_neon
     vst1.8 {d4}, [r0,:64], r12
     vst1.8 {d5}, [r0,:64], r12
     bx lr
-.endfunc
+endfunc

 function x264_predict_8x8_vl_neon
     add r1, #16
@@ -330,7 +330,7 @@ function x264_predict_8x8_vl_neon
     vst1.8 {d3}, [r0,:64], r12
     vst1.8 {d2}, [r0,:64], r12
     bx lr
-.endfunc
+endfunc

 function x264_predict_8x8_vr_neon
     add r1, #8
@@ -362,7 +362,7 @@ function x264_predict_8x8_vr_neon
     vst1.8 {d6}, [r0,:64], r12
     vst1.8 {d3}, [r0,:64], r12
     bx lr
-.endfunc
+endfunc

 function x264_predict_8x8_hd_neon
     mov r12, #FDEC_STRIDE
@@ -395,7 +395,7 @@ function x264_predict_8x8_hd_neon
     vst1.8 {d16}, [r0,:64], r12
     bx lr
-.endfunc
+endfunc

 function x264_predict_8x8_hu_neon
     mov r12, #FDEC_STRIDE
@@ -428,7 +428,7 @@ function x264_predict_8x8_hu_neon
     vst1.8 {d7}, [r0,:64], r12
     vst1.8 {d17}, [r0,:64]
     bx lr
-.endfunc
+endfunc

 function x264_predict_8x8c_dc_top_neon
     sub r2, r0, #FDEC_STRIDE
@@ -441,7 +441,7 @@ function x264_predict_8x8c_dc_top_neon
     vdup.8 d0, d0[0]
     vtrn.32 d0, d1
     b pred8x8_dc_end
-.endfunc
+endfunc

 function x264_predict_8x8c_dc_left_neon
     mov r1, #FDEC_STRIDE
@@ -453,7 +453,7 @@ function x264_predict_8x8c_dc_left_neon
     vdup.8 d1, d0[1]
     vdup.8 d0, d0[0]
     b pred8x8_dc_end
-.endfunc
+endfunc

 function x264_predict_8x8c_dc_neon
     sub r2, r0, #FDEC_STRIDE
@@ -479,7 +479,7 @@ pred8x8_dc_end:
     vst1.8 {d1}, [r2,:64], r1
 .endr
     bx lr
-.endfunc
+endfunc

 function x264_predict_8x8c_h_neon
     sub r1, r0, #1
@@ -491,7 +491,7 @@ function x264_predict_8x8c_h_neon
     vst1.64 {d2}, [r0,:64], ip
 .endr
     bx lr
-.endfunc
+endfunc

 function x264_predict_8x8c_v_neon
     sub r0, r0, #FDEC_STRIDE
@@ -501,7 +501,7 @@ function x264_predict_8x8c_v_neon
     vst1.64 {d0}, [r0,:64], ip
 .endr
     bx lr
-.endfunc
+endfunc

 function x264_predict_8x8c_p_neon
     sub r3, r0, #FDEC_STRIDE
@@ -554,7 +554,7 @@ function x264_predict_8x8c_p_neon
     subs r3, r3, #1
     bne 1b
     bx lr
-.endfunc
+endfunc

 function x264_predict_16x16_dc_top_neon
@@ -565,7 +565,7 @@ function x264_predict_16x16_dc_top_neon
     vrshrn.u16 d0, q0, #4
     vdup.8 q0, d0[0]
     b pred16x16_dc_end
-.endfunc
+endfunc

 function x264_predict_16x16_dc_left_neon
     mov r1, #FDEC_STRIDE
@@ -576,7 +576,7 @@
     vrshrn.u16 d0, q0, #4
     vdup.8 q0, d0[0]
     b pred16x16_dc_end
-.endfunc
+endfunc

 function x264_predict_16x16_dc_neon
     sub r3, r0, #FDEC_STRIDE
@@ -614,7 +614,7 @@ pred16x16_dc_end:
     vst1.64 {d0-d1}, [r0,:128], r1
 .endr
     bx lr
-.endfunc
+endfunc

 function x264_predict_16x16_h_neon
     sub r1, r0, #1
@@ -628,7 +628,7 @@ function x264_predict_16x16_h_neon
     vst1.64 {d2-d3}, [r0,:128], ip
 .endr
     bx lr
-.endfunc
+endfunc

 function x264_predict_16x16_v_neon
     sub r0, r0, #FDEC_STRIDE
@@ -638,7 +638,7 @@ function x264_predict_16x16_v_neon
     vst1.64 {d0-d1}, [r0,:128], ip
 .endr
     bx lr
-.endfunc
+endfunc

 function x264_predict_16x16_p_neon
     sub r3, r0, #FDEC_STRIDE
@@ -695,4 +695,4 @@ function x264_predict_16x16_p_neon
     subs r3, r3, #1
     bne 1b
     bx lr
-.endfunc
+endfunc
diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index 374796c7..d22a10ec 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -78,7 +78,7 @@ function x264_quant_2x2_dc_neon
     vsub.s16 d3, d3, d0
     vst1.64 {d3}, [r0,:64]
     QUANT_END d3
-.endfunc
+endfunc

 // quant_4x4_dc( int16_t dct[16], int mf, int bias )
 function x264_quant_4x4_dc_neon
@@ -90,7 +90,7 @@ function x264_quant_4x4_dc_neon
     QUANT_TWO q0, q0, d4, d5, d4, d5, q0
     vorr d0, d0, d1
     QUANT_END d0
-.endfunc
+endfunc

 // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
 function x264_quant_4x4_neon
@@ -102,7 +102,7 @@ function x264_quant_4x4_neon
     QUANT_TWO q0, q1, d4, d5, d6, d7, q0
     vorr d0, d0, d1
     QUANT_END d0
-.endfunc
+endfunc

 // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
 function x264_quant_4x4x4_neon
@@ -143,7 +143,7 @@ function x264_quant_4x4x4_neon
     orrne r0, #8
     vpop {d8-d15}
     bx lr
-.endfunc
+endfunc

 // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
 function x264_quant_8x8_neon
@@ -163,7 +163,7 @@ function x264_quant_8x8_neon
 .endr
     vorr d0, d0, d1
     QUANT_END d0
-.endfunc
+endfunc

 .macro DEQUANT_START mf_size offset dc=no
     mov r3, #0x2b
@@ -255,7 +255,7 @@ dequant_\size\()_rshift_loop:
     bgt dequant_\size\()_rshift_loop
 .endif
     bx lr
-.endfunc
+endfunc
 .endm

 DEQUANT 4x4, 4
@@ -305,7 +305,7 @@ dequant_4x4_dc_rshift:
     vmovn.s32 d3, q13
     vst1.16 {d0-d3}, [r0,:128]
     bx lr
-.endfunc
+endfunc

 // int coeff_last( int16_t *l )
@@ -317,7 +317,7 @@ function x264_coeff_last4_arm
     lsrs r2, r2, #16
     addne r0, r0, #1
     bx lr
-.endfunc
+endfunc

 function x264_coeff_last8_arm
     ldrd r2, r3, [r0, #8]
@@ -331,7 +331,7 @@ function x264_coeff_last8_arm
     lsrs r2, r2, #16
     addne r0, r0, #1
     bx lr
-.endfunc
+endfunc

 .macro COEFF_LAST_1x size
 function x264_coeff_last\size\()_neon
@@ -356,7 +356,7 @@ function x264_coeff_last\size\()_neon
     subslt r0, r3, r0, lsr #2
     movlt r0, #0
     bx lr
-.endfunc
+endfunc
 .endm

 COEFF_LAST_1x 15
@@ -405,4 +405,4 @@ function x264_coeff_last64_neon
     subslt r0, ip, r0
     movlt r0, #0
     bx lr
-.endfunc
+endfunc
diff --git a/configure b/configure
index 7d56fd8a..ebf737a7 100755
--- a/configure
+++ b/configure
@@ -197,8 +197,9 @@ cpp_check() {

 as_check() {
     log_check "whether $AS supports $1"
-    echo "$1" > conftest.asm
-    if $AS conftest.asm $ASFLAGS $2 -o conftest.o >conftest.log 2>&1; then
+    echo "$1" > conftest$AS_EXT
+    as_cmd="$AS conftest$AS_EXT $ASFLAGS $2 -o conftest.o"
+    if $as_cmd >conftest.log 2>&1; then
         res=$?
         log_ok
     else
@@ -206,12 +207,12 @@ as_check() {
         log_fail
         log_msg "Failed commandline was:"
         log_msg "--------------------------------------------------"
-        log_msg "$AS conftest.asm $ASFLAGS $2 -o conftest.o"
+        log_msg "$as_cmd"
         cat conftest.log >> config.log
         log_msg "--------------------------------------------------"
         log_msg "Failed program was:"
         log_msg "--------------------------------------------------"
-        cat conftest.asm >> config.log
+        cat conftest$AS_EXT >> config.log
         log_msg "--------------------------------------------------"
     fi
     return $res
@@ -302,10 +303,13 @@ HAVE_GETOPT_LONG=1
 cross_prefix=""

 EXE=""
+AS_EXT=".S"
+NL="
+"

 # list of all preprocessor HAVE values we can define
 CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \
-             LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM"
+             LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC"

 # parse options
@@ -586,6 +590,7 @@ case $host_cpu in
     i*86)
         ARCH="X86"
         AS="yasm"
+        AS_EXT=".asm"
         ASFLAGS="$ASFLAGS -O2 -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/"
         if [ $compiler = GNU ]; then
             if [[ "$asm" == auto && "$CFLAGS" != *-march* ]]; then
@@ -626,6 +631,7 @@ case $host_cpu in
     x86_64)
         ARCH="X86_64"
         AS="yasm"
+        AS_EXT=".asm"
         ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/"
         [ $compiler = GNU ] && CFLAGS="-m64 $CFLAGS" && LDFLAGS="-m64 $LDFLAGS"
         if [ "$SYS" = MACOSX ]; then
@@ -649,6 +655,7 @@ case $host_cpu in
         if [ $asm = auto ] ; then
             define HAVE_ALTIVEC
             AS="${AS-${CC}}"
+            AS_EXT=".c"
             if [ $SYS = MACOSX ] ; then
                 CFLAGS="$CFLAGS -faltivec -fastf -mcpu=G4"
             else
@@ -760,6 +767,11 @@ if [ $asm = auto -a $ARCH = ARM ] ; then
     fi
 fi

+if [ $asm = auto -a $ARCH = ARM ] ; then
+    # check if the assembler supports '.func' (clang 3.5 does not)
+    as_check ".func test${NL}.endfunc" && define HAVE_AS_FUNC 1
+fi
+
 [ $asm = no ] && AS=""
 [ "x$AS" = x ] && asm="no" || asm="yes"
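
As a concrete illustration of the asm.S hunk, here is a minimal sketch
(using a hypothetical non-exported function x264_foo, not a symbol from
this patch) of what `function x264_foo, export=0` ... `endfunc` expands
to on ELF when the probe succeeds, i.e. when both ELF and FUNC expand
to nothing:

        .align 2
        .hidden x264_foo                @ ELF-only directive, kept on ELF
        .type   x264_foo, %function
        .func   x264_foo                @ emitted only when HAVE_AS_FUNC=1
    x264_foo:
        bx      lr
        .size   x264_foo, . - x264_foo
        .endfunc                        @ emitted only when HAVE_AS_FUNC=1

When the probe fails, FUNC is defined as `@`, gas's ARM comment
character, so the `.func`/`.endfunc` lines turn into comments while the
`.size` annotation survives. The nested `.macro endfunc`/`.purgem
endfunc` pair lets every function body close with a bare `endfunc`,
which is why the rest of the patch can mechanically replace each
`.endfunc` directive with the `endfunc` macro call.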
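
The configure probe itself is tiny: `as_check ".func test${NL}.endfunc"`
writes a two-line test file (conftest$AS_EXT per the hunk above; on ARM,
AS_EXT keeps its ".S" default) and defines HAVE_AS_FUNC 1 only if
assembling it succeeds. The generated input is just:

    .func test
    .endfunc

gas accepts this pair, while the integrated assembler in the LLVM trunk
named in the commit message rejects `.func`, so there the probe fails
and the FUNC macro comments the directives out as sketched above.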