]> git.sesse.net Git - ffmpeg/blobdiff - libavcodec/arm/h264dsp_neon.S
FFT: factor a shuffle out of the inner loop and merge it into fft_permute.
[ffmpeg] / libavcodec / arm / h264dsp_neon.S
index edfce3a16809caa1a96691a8c1549d37312fb8b3..bd15ced7367ef3c5256ce5331a7749a8d50e6029 100644 (file)
@@ -183,7 +183,7 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
         bgt             5b
 
         pop             {r4-r7, pc}
-        .endfunc
+endfunc
         .endm
 
 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@@ -317,9 +317,77 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
         bgt             5b
 
         pop             {r4-r7, pc}
-        .endfunc
+endfunc
         .endm
 
+        .macro  h264_chroma_mc2 type            /* H.264 chroma MC writing 2 bytes per row; type is put or avg */
+function ff_\type\()_h264_chroma_mc2_neon, export=1
+        push            {r4-r6, lr}             /* 16 bytes pushed: stack args now start at sp+16 */
+        ldr             r4,  [sp, #16]          /* r4 = 1st stack arg (presumably x, cf. chroma_mc4 prototype) */
+        ldr             lr,  [sp, #20]          /* lr = 2nd stack arg (presumably y) */
+        pld             [r1]                    /* preload the first two source rows */
+        pld             [r1, r2]
+        orrs            r5,  r4,  lr            /* Z is set only when r4 and lr are both zero */
+        beq             2f                      /* then skip the filter: copy/average loop at 2 */
+
+        mul             r5,  r4,  lr            /* with x = r4, y = lr: r5 = x*y */
+        rsb             r6,  r5,  lr,  lsl #3   /* r6  = 8*y - x*y = (8-x)*y */
+        rsb             r12, r5,  r4,  lsl #3   /* r12 = 8*x - x*y = x*(8-y) */
+        sub             r4,  r5,  r4,  lsl #3   /* r4  = x*y - 8*x */
+        sub             r4,  r4,  lr,  lsl #3   /* r4  = x*y - 8*x - 8*y */
+        add             r4,  r4,  #64           /* r4  = (8-x)*(8-y); r4+r12+r6+r5 == 64 */
+        vdup.8          d0,  r4                 /* d0 = (8-x)*(8-y) in every byte */
+        vdup.8          d2,  r12                /* d2 = x*(8-y) in every byte */
+        vdup.8          d1,  r6                 /* d1 = (8-x)*y in every byte */
+        vdup.8          d3,  r5                 /* d3 = x*y in every byte */
+        vtrn.16         q0,  q1                 /* d0 bytes = {r4,r4,r12,r12,...}, d1 bytes = {r6,r6,r5,r5,...} */
+1:
+        vld1.32         {d4[0]},  [r1], r2      /* d4[0] = 4 bytes of source row n */
+        vld1.32         {d4[1]},  [r1], r2      /* d4[1] = 4 bytes of source row n+1 */
+        vrev64.32       d5,  d4                 /* d5[0] = row n+1 */
+        vld1.32         {d5[1]},  [r1]          /* d5[1] = row n+2; no post-increment, r1 stays on row n+2 */
+        vext.8          q3,  q2,  q2,  #1       /* q3 = q2 rotated down by one byte */
+        vtrn.16         q2,  q3                 /* per row: bytes {p0,p1,p1,p2}, lined up with the d0/d1 weight pairs */
+        vmull.u8        q8,  d4,  d0            /* q8  = (rows n, n+1)   * d0 weights */
+        vmlal.u8        q8,  d5,  d1            /* q8 += (rows n+1, n+2) * d1 weights */
+.ifc \type,avg
+        vld1.16         {d18[0]}, [r0,:16], r2  /* d18[0] = 2 existing dst bytes, row n */
+        vld1.16         {d18[1]}, [r0,:16]      /* d18[1] = 2 existing dst bytes, row n+1 */
+        sub             r0,  r0,  r2            /* undo the post-increment so the stores below start at row n */
+.endif
+        vtrn.32         d16, d17                /* bring the two partial sums of each output pixel into matching lanes */
+        vadd.i16        d16, d16, d17           /* d16 = four 16-bit weighted sums (2 pixels x 2 rows) */
+        vrshrn.u16      d16, q8,  #6            /* (sum + 32) >> 6, narrowed to bytes */
+.ifc \type,avg
+        vrhadd.u8       d16, d16, d18           /* rounding average with the existing dst bytes */
+.endif
+        vst1.16         {d16[0]}, [r0,:16], r2  /* store 2 bytes to dst row n */
+        vst1.16         {d16[1]}, [r0,:16], r2  /* store 2 bytes to dst row n+1 */
+        subs            r3,  r3,  #2            /* two rows are produced per iteration */
+        bgt             1b
+        pop             {r4-r6, pc}
+2:                                              /* reached when r4 == lr == 0: no weighting applied */
+.ifc \type,put
+        ldrh            r5,  [r1], r2           /* copy 2 bytes, row n */
+        strh            r5,  [r0], r2
+        ldrh            r6,  [r1], r2           /* copy 2 bytes, row n+1 */
+        strh            r6,  [r0], r2
+.else
+        vld1.16         {d16[0]}, [r1], r2      /* d16[0] = 2 source bytes, row n */
+        vld1.16         {d16[1]}, [r1], r2      /* d16[1] = 2 source bytes, row n+1 */
+        vld1.16         {d18[0]}, [r0,:16], r2  /* d18[0] = 2 existing dst bytes, row n */
+        vld1.16         {d18[1]}, [r0,:16]      /* d18[1] = 2 existing dst bytes, row n+1 */
+        sub             r0,  r0,  r2            /* undo the post-increment so the stores below start at row n */
+        vrhadd.u8       d16, d16, d18           /* rounding average of source and dst */
+        vst1.16         {d16[0]}, [r0,:16], r2
+        vst1.16         {d16[1]}, [r0,:16], r2
+.endif
+        subs            r3,  r3,  #2            /* two rows are handled per iteration */
+        bgt             2b
+        pop             {r4-r6, pc}
+endfunc
+.endm
+
         .text
         .align
 
@@ -327,6 +395,8 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
         h264_chroma_mc8 avg
         h264_chroma_mc4 put
         h264_chroma_mc4 avg
+        h264_chroma_mc2 put
+        h264_chroma_mc2 avg
 
         /* H.264 loop filter */
 
@@ -448,7 +518,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
 
         align_pop_regs
         bx              lr
-        .endfunc
+endfunc
 
 function ff_h264_h_loop_filter_luma_neon, export=1
         h264_loop_filter_start
@@ -500,7 +570,7 @@ function ff_h264_h_loop_filter_luma_neon, export=1
 
         align_pop_regs
         bx              lr
-        .endfunc
+endfunc
 
         .macro h264_loop_filter_chroma
         vdup.8          d22, r2         @ alpha
@@ -516,19 +586,17 @@ function ff_h264_h_loop_filter_luma_neon, export=1
         vclt.u8         d26, d26, d22   @ < alpha
         vsubw.u8        q2,  q2,  d2
         vdup.8          d22, r3         @ beta
-        vclt.s8         d25, d24, #0
         vrshrn.i16      d4,  q2,  #3
         vclt.u8         d28, d28, d22   @ < beta
-        vbic            d26, d26, d25
         vclt.u8         d30, d30, d22   @ < beta
-        vand            d26, d26, d28
-        vneg.s8         d25, d24
-        vand            d26, d26, d30
         vmin.s8         d4,  d4,  d24
-        vmovl.u8        q14, d16
-        vand            d4,  d4,  d26
+        vneg.s8         d25, d24
+        vand            d26, d26, d28
         vmax.s8         d4,  d4,  d25
+        vand            d26, d26, d30
         vmovl.u8        q11, d0
+        vand            d4,  d4,  d26
+        vmovl.u8        q14, d16
         vaddw.s8        q14, q14, d4
         vsubw.s8        q11, q11, d4
         vqmovun.s16     d16, q14
@@ -551,7 +619,7 @@ function ff_h264_v_loop_filter_chroma_neon, export=1
         vst1.64         {d0},  [r0,:64], r1
 
         bx              lr
-        .endfunc
+endfunc
 
 function ff_h264_h_loop_filter_chroma_neon, export=1
         h264_loop_filter_start
@@ -589,7 +657,7 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
         vst1.32         {d2[1]},  [r0], r1
 
         bx              lr
-        .endfunc
+endfunc
 
         /* H.264 qpel MC */
 
@@ -704,35 +772,48 @@ function put_h264_qpel16_h_lowpass_neon_packed
         mov             ip,  #16
         mov             lr,  r4
         b               put_h264_qpel8_h_lowpass_neon
-        .endfunc
+endfunc
 
-function put_h264_qpel16_h_lowpass_neon
+        .macro h264_qpel_h_lowpass type         /* emits the qpel16 and qpel8 h_lowpass routines; type is put or avg */
+function \type\()_h264_qpel16_h_lowpass_neon
         push            {lr}
         mov             ip,  #16
-        bl              put_h264_qpel8_h_lowpass_neon
+        bl              \type\()_h264_qpel8_h_lowpass_neon     /* 8-byte-wide pass over ip = 16 rows */
         sub             r0,  r0,  r3, lsl #4
         sub             r1,  r1,  r2, lsl #4
         add             r0,  r0,  #8
         add             r1,  r1,  #8
         mov             ip,  #16
         pop             {lr}
-        .endfunc
+endfunc
 
-function put_h264_qpel8_h_lowpass_neon
+function \type\()_h264_qpel8_h_lowpass_neon     /* no return above: presumably entered by fall-through for the columns at +8 (TODO confirm endfunc emits no code) */
 1:      vld1.64         {d0, d1},  [r1], r2
         vld1.64         {d16,d17}, [r1], r2
         subs            ip,  ip,  #2
         lowpass_8       d0,  d1,  d16, d17, d0,  d16
+.ifc \type,avg
+        vld1.8          {d2},     [r0,:64], r3  /* d2 = 8 existing dst bytes, row n */
+        vrhadd.u8       d0,  d0,  d2            /* rounding average into d0, which is stored to row n below */
+        vld1.8          {d3},     [r0,:64]      /* d3 = 8 existing dst bytes, row n+1 (no post-increment) */
+        vrhadd.u8       d16, d16, d3            /* rounding average into d16, which is stored to row n+1 below */
+        sub             r0,  r0,  r3            /* rewind r0 to row n for the two stores below */
+.endif
         vst1.64         {d0},     [r0,:64], r3
         vst1.64         {d16},    [r0,:64], r3
         bne             1b
         bx              lr
-        .endfunc
+endfunc
+        .endm
+
+        h264_qpel_h_lowpass put
+        h264_qpel_h_lowpass avg
 
-function put_h264_qpel16_h_lowpass_l2_neon
+        .macro h264_qpel_h_lowpass_l2 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon
         push            {lr}
         mov             ip,  #16
-        bl              put_h264_qpel8_h_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
         sub             r0,  r0,  r2, lsl #4
         sub             r1,  r1,  r2, lsl #4
         sub             r3,  r3,  r2, lsl #4
@@ -741,9 +822,9 @@ function put_h264_qpel16_h_lowpass_l2_neon
         add             r3,  r3,  #8
         mov             ip,  #16
         pop             {lr}
-        .endfunc
+endfunc
 
-function put_h264_qpel8_h_lowpass_l2_neon
+function \type\()_h264_qpel8_h_lowpass_l2_neon
 1:      vld1.64         {d0, d1},  [r1], r2
         vld1.64         {d16,d17}, [r1], r2
         vld1.64         {d28},     [r3], r2
@@ -751,11 +832,22 @@ function put_h264_qpel8_h_lowpass_l2_neon
         subs            ip,  ip,  #2
         lowpass_8       d0,  d1,  d16, d17, d0,  d1
         vrhadd.u8       q0,  q0,  q14
+.ifc \type,avg
+        vld1.8          {d2},      [r0,:64], r2
+        vrhadd.u8       d0,  d0,  d2
+        vld1.8          {d3},      [r0,:64]
+        vrhadd.u8       d1,  d1,  d3
+        sub             r0,  r0,  r2
+.endif
         vst1.64         {d0},      [r0,:64], r2
         vst1.64         {d1},      [r0,:64], r2
         bne             1b
         bx              lr
-        .endfunc
+endfunc
+        .endm
+
+        h264_qpel_h_lowpass_l2 put
+        h264_qpel_h_lowpass_l2 avg
 
 function put_h264_qpel16_v_lowpass_neon_packed
         mov             r4,  lr
@@ -770,24 +862,25 @@ function put_h264_qpel16_v_lowpass_neon_packed
         sub             r1,  r1,  r3, lsl #2
         mov             lr,  r4
         b               put_h264_qpel8_v_lowpass_neon
-        .endfunc
+endfunc
 
-function put_h264_qpel16_v_lowpass_neon
+        .macro h264_qpel_v_lowpass type
+function \type\()_h264_qpel16_v_lowpass_neon
         mov             r4,  lr
-        bl              put_h264_qpel8_v_lowpass_neon
+        bl              \type\()_h264_qpel8_v_lowpass_neon
         sub             r1,  r1,  r3, lsl #2
-        bl              put_h264_qpel8_v_lowpass_neon
+        bl              \type\()_h264_qpel8_v_lowpass_neon
         sub             r0,  r0,  r2, lsl #4
         add             r0,  r0,  #8
         sub             r1,  r1,  r3, lsl #4
         sub             r1,  r1,  r3, lsl #2
         add             r1,  r1,  #8
-        bl              put_h264_qpel8_v_lowpass_neon
+        bl              \type\()_h264_qpel8_v_lowpass_neon
         sub             r1,  r1,  r3, lsl #2
         mov             lr,  r4
-        .endfunc
+endfunc
 
-function put_h264_qpel8_v_lowpass_neon
+function \type\()_h264_qpel8_v_lowpass_neon
         vld1.64         {d8},  [r1], r3
         vld1.64         {d10}, [r1], r3
         vld1.64         {d12}, [r1], r3
@@ -809,6 +902,26 @@ function put_h264_qpel8_v_lowpass_neon
         lowpass_8       d26, d27, d28, d29, d26, d28
         transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28
 
+.ifc \type,avg
+        vld1.8          {d9},  [r0,:64], r2
+        vrhadd.u8       d8,  d8,  d9
+        vld1.8          {d11}, [r0,:64], r2
+        vrhadd.u8       d10, d10, d11
+        vld1.8          {d13}, [r0,:64], r2
+        vrhadd.u8       d12, d12, d13
+        vld1.8          {d15}, [r0,:64], r2
+        vrhadd.u8       d14, d14, d15
+        vld1.8          {d23}, [r0,:64], r2
+        vrhadd.u8       d22, d22, d23
+        vld1.8          {d25}, [r0,:64], r2
+        vrhadd.u8       d24, d24, d25
+        vld1.8          {d27}, [r0,:64], r2
+        vrhadd.u8       d26, d26, d27
+        vld1.8          {d29}, [r0,:64], r2
+        vrhadd.u8       d28, d28, d29
+        sub             r0,  r0,  r2,  lsl #3
+.endif
+
         vst1.64         {d8},  [r0,:64], r2
         vst1.64         {d10}, [r0,:64], r2
         vst1.64         {d12}, [r0,:64], r2
@@ -819,13 +932,18 @@ function put_h264_qpel8_v_lowpass_neon
         vst1.64         {d28}, [r0,:64], r2
 
         bx              lr
-        .endfunc
+endfunc
+        .endm
+
+        h264_qpel_v_lowpass put
+        h264_qpel_v_lowpass avg
 
-function put_h264_qpel16_v_lowpass_l2_neon
+        .macro h264_qpel_v_lowpass_l2 type
+function \type\()_h264_qpel16_v_lowpass_l2_neon
         mov             r4,  lr
-        bl              put_h264_qpel8_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
         sub             r1,  r1,  r3, lsl #2
-        bl              put_h264_qpel8_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
         sub             r0,  r0,  r3, lsl #4
         sub             ip,  ip,  r2, lsl #4
         add             r0,  r0,  #8
@@ -833,12 +951,12 @@ function put_h264_qpel16_v_lowpass_l2_neon
         sub             r1,  r1,  r3, lsl #4
         sub             r1,  r1,  r3, lsl #2
         add             r1,  r1,  #8
-        bl              put_h264_qpel8_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
         sub             r1,  r1,  r3, lsl #2
         mov             lr,  r4
-        .endfunc
+endfunc
 
-function put_h264_qpel8_v_lowpass_l2_neon
+function \type\()_h264_qpel8_v_lowpass_l2_neon
         vld1.64         {d8},  [r1], r3
         vld1.64         {d10}, [r1], r3
         vld1.64         {d12}, [r1], r3
@@ -871,10 +989,30 @@ function put_h264_qpel8_v_lowpass_l2_neon
         vld1.64         {d10}, [ip], r2
         vrhadd.u8       q2,  q2,  q11
         vld1.64         {d11}, [ip], r2
+        vrhadd.u8       q5,  q5,  q13
+
+.ifc \type,avg
+        vld1.8          {d16}, [r0,:64], r3
+        vrhadd.u8       d0,  d0,  d16
+        vld1.8          {d17}, [r0,:64], r3
+        vrhadd.u8       d1,  d1,  d17
+        vld1.8          {d16}, [r0,:64], r3
+        vrhadd.u8       d2,  d2,  d16
+        vld1.8          {d17}, [r0,:64], r3
+        vrhadd.u8       d3,  d3,  d17
+        vld1.8          {d16}, [r0,:64], r3
+        vrhadd.u8       d4,  d4,  d16
+        vld1.8          {d17}, [r0,:64], r3
+        vrhadd.u8       d5,  d5,  d17
+        vld1.8          {d16}, [r0,:64], r3
+        vrhadd.u8       d10, d10, d16
+        vld1.8          {d17}, [r0,:64], r3
+        vrhadd.u8       d11, d11, d17
+        sub             r0,  r0,  r3,  lsl #3
+.endif
 
         vst1.64         {d0},  [r0,:64], r3
         vst1.64         {d1},  [r0,:64], r3
-        vrhadd.u8       q5,  q5,  q13
         vst1.64         {d2},  [r0,:64], r3
         vst1.64         {d3},  [r0,:64], r3
         vst1.64         {d4},  [r0,:64], r3
@@ -883,7 +1021,11 @@ function put_h264_qpel8_v_lowpass_l2_neon
         vst1.64         {d11}, [r0,:64], r3
 
         bx              lr
-        .endfunc
+endfunc
+        .endm
+
+        h264_qpel_v_lowpass_l2 put
+        h264_qpel_v_lowpass_l2 avg
 
 function put_h264_qpel8_hv_lowpass_neon_top
         lowpass_const   ip
@@ -949,11 +1091,31 @@ function put_h264_qpel8_hv_lowpass_neon_top
         transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11
 
         bx              lr
-        .endfunc
+endfunc
 
-function put_h264_qpel8_hv_lowpass_neon
+        .macro h264_qpel8_hv_lowpass type
+function \type\()_h264_qpel8_hv_lowpass_neon
         mov             r10, lr
         bl              put_h264_qpel8_hv_lowpass_neon_top
+.ifc \type,avg
+        vld1.8          {d0},      [r0,:64], r2
+        vrhadd.u8       d12, d12, d0
+        vld1.8          {d1},      [r0,:64], r2
+        vrhadd.u8       d13, d13, d1
+        vld1.8          {d2},      [r0,:64], r2
+        vrhadd.u8       d14, d14, d2
+        vld1.8          {d3},      [r0,:64], r2
+        vrhadd.u8       d15, d15, d3
+        vld1.8          {d4},      [r0,:64], r2
+        vrhadd.u8       d8,  d8,  d4
+        vld1.8          {d5},      [r0,:64], r2
+        vrhadd.u8       d9,  d9,  d5
+        vld1.8          {d6},      [r0,:64], r2
+        vrhadd.u8       d10, d10, d6
+        vld1.8          {d7},      [r0,:64], r2
+        vrhadd.u8       d11, d11, d7
+        sub             r0,  r0,  r2,  lsl #3
+.endif
         vst1.64         {d12},     [r0,:64], r2
         vst1.64         {d13},     [r0,:64], r2
         vst1.64         {d14},     [r0,:64], r2
@@ -965,9 +1127,14 @@ function put_h264_qpel8_hv_lowpass_neon
 
         mov             lr,  r10
         bx              lr
-        .endfunc
+endfunc
+        .endm
 
-function put_h264_qpel8_hv_lowpass_l2_neon
+        h264_qpel8_hv_lowpass put
+        h264_qpel8_hv_lowpass avg
+
+        .macro h264_qpel8_hv_lowpass_l2 type
+function \type\()_h264_qpel8_hv_lowpass_l2_neon
         mov             r10, lr
         bl              put_h264_qpel8_hv_lowpass_neon_top
 
@@ -978,9 +1145,27 @@ function put_h264_qpel8_hv_lowpass_l2_neon
         vrhadd.u8       q1,  q1,  q7
         vld1.64         {d6, d7},  [r2,:128]!
         vrhadd.u8       q2,  q2,  q4
-
-        vst1.64         {d0},      [r0,:64], r3
         vrhadd.u8       q3,  q3,  q5
+.ifc \type,avg
+        vld1.8          {d16},     [r0,:64], r3
+        vrhadd.u8       d0,  d0,  d16
+        vld1.8          {d17},     [r0,:64], r3
+        vrhadd.u8       d1,  d1,  d17
+        vld1.8          {d18},     [r0,:64], r3
+        vrhadd.u8       d2,  d2,  d18
+        vld1.8          {d19},     [r0,:64], r3
+        vrhadd.u8       d3,  d3,  d19
+        vld1.8          {d20},     [r0,:64], r3
+        vrhadd.u8       d4,  d4,  d20
+        vld1.8          {d21},     [r0,:64], r3
+        vrhadd.u8       d5,  d5,  d21
+        vld1.8          {d22},     [r0,:64], r3
+        vrhadd.u8       d6,  d6,  d22
+        vld1.8          {d23},     [r0,:64], r3
+        vrhadd.u8       d7,  d7,  d23
+        sub             r0,  r0,  r3,  lsl #3
+.endif
+        vst1.64         {d0},      [r0,:64], r3
         vst1.64         {d1},      [r0,:64], r3
         vst1.64         {d2},      [r0,:64], r3
         vst1.64         {d3},      [r0,:64], r3
@@ -991,81 +1176,91 @@ function put_h264_qpel8_hv_lowpass_l2_neon
 
         mov             lr,  r10
         bx              lr
-        .endfunc
+endfunc
+        .endm
 
-function put_h264_qpel16_hv_lowpass_neon
+        h264_qpel8_hv_lowpass_l2 put
+        h264_qpel8_hv_lowpass_l2 avg
+
+        .macro h264_qpel16_hv type              /* emits the qpel16 hv_lowpass routines, each built from four qpel8 calls; type is put or avg */
+function \type\()_h264_qpel16_hv_lowpass_neon
         mov             r9,  lr
-        bl              put_h264_qpel8_hv_lowpass_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_neon    /* 1st block; our return address is kept in r9 */
         sub             r1,  r1,  r3, lsl #2
-        bl              put_h264_qpel8_hv_lowpass_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_neon    /* 2nd block, after stepping r1 back by 4*r3 */
         sub             r1,  r1,  r3, lsl #4
         sub             r1,  r1,  r3, lsl #2
         add             r1,  r1,  #8
         sub             r0,  r0,  r2, lsl #4
         add             r0,  r0,  #8
-        bl              put_h264_qpel8_hv_lowpass_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_neon    /* 3rd block: r0 -= 16*r2, r1 -= 20*r3, both += 8 */
         sub             r1,  r1,  r3, lsl #2
         mov             lr,  r9
-        b               put_h264_qpel8_hv_lowpass_neon
-        .endfunc
+        b               \type\()_h264_qpel8_hv_lowpass_neon    /* 4th block as a tail call: lr = r9, so it returns to our caller */
+endfunc
 
-function put_h264_qpel16_hv_lowpass_l2_neon
+function \type\()_h264_qpel16_hv_lowpass_l2_neon
         mov             r9,  lr
         sub             r2,  r4,  #256
-        bl              put_h264_qpel8_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon /* 1st block; r2 = r4 - 256 was set just above */
         sub             r1,  r1,  r3, lsl #2
-        bl              put_h264_qpel8_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon /* 2nd block, after stepping r1 back by 4*r3 */
         sub             r1,  r1,  r3, lsl #4
         sub             r1,  r1,  r3, lsl #2
         add             r1,  r1,  #8
         sub             r0,  r0,  r3, lsl #4
         add             r0,  r0,  #8
-        bl              put_h264_qpel8_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon /* 3rd block: r0 -= 16*r3, r1 -= 20*r3, both += 8 */
         sub             r1,  r1,  r3, lsl #2
         mov             lr,  r9
-        b               put_h264_qpel8_hv_lowpass_l2_neon
-        .endfunc
+        b               \type\()_h264_qpel8_hv_lowpass_l2_neon /* 4th block as a tail call: lr = r9, so it returns to our caller */
+endfunc
+        .endm
+
+        h264_qpel16_hv put
+        h264_qpel16_hv avg
 
-function ff_put_h264_qpel8_mc10_neon, export=1
+        .macro h264_qpel8 type
+function ff_\type\()_h264_qpel8_mc10_neon, export=1
         lowpass_const   r3
         mov             r3,  r1
         sub             r1,  r1,  #2
         mov             ip,  #8
-        b               put_h264_qpel8_h_lowpass_l2_neon
-        .endfunc
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
 
-function ff_put_h264_qpel8_mc20_neon, export=1
+function ff_\type\()_h264_qpel8_mc20_neon, export=1
         lowpass_const   r3
         sub             r1,  r1,  #2
         mov             r3,  r2
         mov             ip,  #8
-        b               put_h264_qpel8_h_lowpass_neon
-        .endfunc
+        b               \type\()_h264_qpel8_h_lowpass_neon
+endfunc
 
-function ff_put_h264_qpel8_mc30_neon, export=1
+function ff_\type\()_h264_qpel8_mc30_neon, export=1
         lowpass_const   r3
         add             r3,  r1,  #1
         sub             r1,  r1,  #2
         mov             ip,  #8
-        b               put_h264_qpel8_h_lowpass_l2_neon
-        .endfunc
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
 
-function ff_put_h264_qpel8_mc01_neon, export=1
+function ff_\type\()_h264_qpel8_mc01_neon, export=1
         push            {lr}
         mov             ip,  r1
-put_h264_qpel8_mc01:
+\type\()_h264_qpel8_mc01:
         lowpass_const   r3
         mov             r3,  r2
         sub             r1,  r1,  r2, lsl #1
         vpush           {d8-d15}
-        bl              put_h264_qpel8_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
         vpop            {d8-d15}
         pop             {pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel8_mc11_neon, export=1
+function ff_\type\()_h264_qpel8_mc11_neon, export=1
         push            {r0, r1, r11, lr}
-put_h264_qpel8_mc11:
+\type\()_h264_qpel8_mc11:
         lowpass_const   r3
         mov             r11, sp
         bic             sp,  sp,  #15
@@ -1081,15 +1276,15 @@ put_h264_qpel8_mc11:
         add             ip,  sp,  #64
         sub             r1,  r1,  r2, lsl #1
         mov             r2,  #8
-        bl              put_h264_qpel8_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
         vpop            {d8-d15}
         add             sp,  r11, #8
         pop             {r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel8_mc21_neon, export=1
+function ff_\type\()_h264_qpel8_mc21_neon, export=1
         push            {r0, r1, r4, r10, r11, lr}
-put_h264_qpel8_mc21:
+\type\()_h264_qpel8_mc21:
         lowpass_const   r3
         mov             r11, sp
         bic             sp,  sp,  #15
@@ -1106,33 +1301,33 @@ put_h264_qpel8_mc21:
         sub             r1,  r1,  #2
         mov             r3,  r2
         sub             r2,  r4,  #64
-        bl              put_h264_qpel8_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
         vpop            {d8-d15}
         add             sp,  r11,  #8
         pop             {r4, r10, r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel8_mc31_neon, export=1
+function ff_\type\()_h264_qpel8_mc31_neon, export=1
         add             r1,  r1,  #1
         push            {r0, r1, r11, lr}
         sub             r1,  r1,  #1
-        b               put_h264_qpel8_mc11
-        .endfunc
+        b               \type\()_h264_qpel8_mc11
+endfunc
 
-function ff_put_h264_qpel8_mc02_neon, export=1
+function ff_\type\()_h264_qpel8_mc02_neon, export=1
         push            {lr}
         lowpass_const   r3
         sub             r1,  r1,  r2, lsl #1
         mov             r3,  r2
         vpush           {d8-d15}
-        bl              put_h264_qpel8_v_lowpass_neon
+        bl              \type\()_h264_qpel8_v_lowpass_neon
         vpop            {d8-d15}
         pop             {pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel8_mc12_neon, export=1
+function ff_\type\()_h264_qpel8_mc12_neon, export=1
         push            {r0, r1, r4, r10, r11, lr}
-put_h264_qpel8_mc12:
+\type\()_h264_qpel8_mc12:
         lowpass_const   r3
         mov             r11, sp
         bic             sp,  sp,  #15
@@ -1148,13 +1343,13 @@ put_h264_qpel8_mc12:
         sub             r1,  r1,  r3, lsl #1
         sub             r1,  r1,  #2
         sub             r2,  r4,  #64
-        bl              put_h264_qpel8_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
         vpop            {d8-d15}
         add             sp,  r11,  #8
         pop             {r4, r10, r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel8_mc22_neon, export=1
+function ff_\type\()_h264_qpel8_mc22_neon, export=1
         push            {r4, r10, r11, lr}
         mov             r11, sp
         bic             sp,  sp,  #15
@@ -1164,81 +1359,86 @@ function ff_put_h264_qpel8_mc22_neon, export=1
         sub             sp,  sp,  #(16*12)
         mov             r4,  sp
         vpush           {d8-d15}
-        bl              put_h264_qpel8_hv_lowpass_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_neon
         vpop            {d8-d15}
         mov             sp,  r11
         pop             {r4, r10, r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel8_mc32_neon, export=1
+function ff_\type\()_h264_qpel8_mc32_neon, export=1
         push            {r0, r1, r4, r10, r11, lr}
         add             r1,  r1,  #1
-        b               put_h264_qpel8_mc12
-        .endfunc
+        b               \type\()_h264_qpel8_mc12
+endfunc
 
-function ff_put_h264_qpel8_mc03_neon, export=1
+function ff_\type\()_h264_qpel8_mc03_neon, export=1
         push            {lr}
         add             ip,  r1,  r2
-        b               put_h264_qpel8_mc01
-        .endfunc
+        b               \type\()_h264_qpel8_mc01
+endfunc
 
-function ff_put_h264_qpel8_mc13_neon, export=1
+function ff_\type\()_h264_qpel8_mc13_neon, export=1
         push            {r0, r1, r11, lr}
         add             r1,  r1,  r2
-        b               put_h264_qpel8_mc11
-        .endfunc
+        b               \type\()_h264_qpel8_mc11
+endfunc
 
-function ff_put_h264_qpel8_mc23_neon, export=1
+function ff_\type\()_h264_qpel8_mc23_neon, export=1
         push            {r0, r1, r4, r10, r11, lr}
         add             r1,  r1,  r2
-        b               put_h264_qpel8_mc21
-        .endfunc
+        b               \type\()_h264_qpel8_mc21
+endfunc
 
-function ff_put_h264_qpel8_mc33_neon, export=1
+function ff_\type\()_h264_qpel8_mc33_neon, export=1
         add             r1,  r1,  #1
         push            {r0, r1, r11, lr}
         add             r1,  r1,  r2
         sub             r1,  r1,  #1
-        b               put_h264_qpel8_mc11
-        .endfunc
+        b               \type\()_h264_qpel8_mc11
+endfunc
+        .endm
+
+        h264_qpel8 put
+        h264_qpel8 avg
 
-function ff_put_h264_qpel16_mc10_neon, export=1
+        .macro h264_qpel16 type
+function ff_\type\()_h264_qpel16_mc10_neon, export=1
         lowpass_const   r3
         mov             r3,  r1
         sub             r1,  r1,  #2
-        b               put_h264_qpel16_h_lowpass_l2_neon
-        .endfunc
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
 
-function ff_put_h264_qpel16_mc20_neon, export=1
+function ff_\type\()_h264_qpel16_mc20_neon, export=1
         lowpass_const   r3
         sub             r1,  r1,  #2
         mov             r3,  r2
-        b               put_h264_qpel16_h_lowpass_neon
-        .endfunc
+        b               \type\()_h264_qpel16_h_lowpass_neon
+endfunc
 
-function ff_put_h264_qpel16_mc30_neon, export=1
+function ff_\type\()_h264_qpel16_mc30_neon, export=1
         lowpass_const   r3
         add             r3,  r1,  #1
         sub             r1,  r1,  #2
-        b               put_h264_qpel16_h_lowpass_l2_neon
-        .endfunc
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
 
-function ff_put_h264_qpel16_mc01_neon, export=1
+function ff_\type\()_h264_qpel16_mc01_neon, export=1
         push            {r4, lr}
         mov             ip,  r1
-put_h264_qpel16_mc01:
+\type\()_h264_qpel16_mc01:
         lowpass_const   r3
         mov             r3,  r2
         sub             r1,  r1,  r2, lsl #1
         vpush           {d8-d15}
-        bl              put_h264_qpel16_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
         vpop            {d8-d15}
         pop             {r4, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel16_mc11_neon, export=1
+function ff_\type\()_h264_qpel16_mc11_neon, export=1
         push            {r0, r1, r4, r11, lr}
-put_h264_qpel16_mc11:
+\type\()_h264_qpel16_mc11:
         lowpass_const   r3
         mov             r11, sp
         bic             sp,  sp,  #15
@@ -1253,15 +1453,15 @@ put_h264_qpel16_mc11:
         add             ip,  sp,  #64
         sub             r1,  r1,  r2, lsl #1
         mov             r2,  #16
-        bl              put_h264_qpel16_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
         vpop            {d8-d15}
         add             sp,  r11, #8
         pop             {r4, r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel16_mc21_neon, export=1
+function ff_\type\()_h264_qpel16_mc21_neon, export=1
         push            {r0, r1, r4-r5, r9-r11, lr}
-put_h264_qpel16_mc21:
+\type\()_h264_qpel16_mc21:
         lowpass_const   r3
         mov             r11, sp
         bic             sp,  sp,  #15
@@ -1275,33 +1475,33 @@ put_h264_qpel16_mc21:
         sub             r1,  r1,  r2, lsl #1
         sub             r1,  r1,  #2
         mov             r3,  r2
-        bl              put_h264_qpel16_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
         vpop            {d8-d15}
         add             sp,  r11,  #8
         pop             {r4-r5, r9-r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel16_mc31_neon, export=1
+function ff_\type\()_h264_qpel16_mc31_neon, export=1
         add             r1,  r1,  #1
         push            {r0, r1, r4, r11, lr}
         sub             r1,  r1,  #1
-        b               put_h264_qpel16_mc11
-        .endfunc
+        b               \type\()_h264_qpel16_mc11
+endfunc
 
-function ff_put_h264_qpel16_mc02_neon, export=1
+function ff_\type\()_h264_qpel16_mc02_neon, export=1
         push            {r4, lr}
         lowpass_const   r3
         sub             r1,  r1,  r2, lsl #1
         mov             r3,  r2
         vpush           {d8-d15}
-        bl              put_h264_qpel16_v_lowpass_neon
+        bl              \type\()_h264_qpel16_v_lowpass_neon
         vpop            {d8-d15}
         pop             {r4, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel16_mc12_neon, export=1
+function ff_\type\()_h264_qpel16_mc12_neon, export=1
         push            {r0, r1, r4-r5, r9-r11, lr}
-put_h264_qpel16_mc12:
+\type\()_h264_qpel16_mc12:
         lowpass_const   r3
         mov             r11, sp
         bic             sp,  sp,  #15
@@ -1316,13 +1516,13 @@ put_h264_qpel16_mc12:
         sub             r1,  r1,  r3, lsl #1
         sub             r1,  r1,  #2
         mov             r2,  r3
-        bl              put_h264_qpel16_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
         vpop            {d8-d15}
         add             sp,  r11,  #8
         pop             {r4-r5, r9-r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel16_mc22_neon, export=1
+function ff_\type\()_h264_qpel16_mc22_neon, export=1
         push            {r4, r9-r11, lr}
         lowpass_const   r3
         mov             r11, sp
@@ -1333,43 +1533,47 @@ function ff_put_h264_qpel16_mc22_neon, export=1
         sub             sp,  sp,  #(16*12)
         mov             r4,  sp
         vpush           {d8-d15}
-        bl              put_h264_qpel16_hv_lowpass_neon
+        bl              \type\()_h264_qpel16_hv_lowpass_neon
         vpop            {d8-d15}
         mov             sp,  r11
         pop             {r4, r9-r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel16_mc32_neon, export=1
+function ff_\type\()_h264_qpel16_mc32_neon, export=1
         push            {r0, r1, r4-r5, r9-r11, lr}
         add             r1,  r1,  #1
-        b               put_h264_qpel16_mc12
-        .endfunc
+        b               \type\()_h264_qpel16_mc12
+endfunc
 
-function ff_put_h264_qpel16_mc03_neon, export=1
+function ff_\type\()_h264_qpel16_mc03_neon, export=1
         push            {r4, lr}
         add             ip,  r1,  r2
-        b               put_h264_qpel16_mc01
-        .endfunc
+        b               \type\()_h264_qpel16_mc01
+endfunc
 
-function ff_put_h264_qpel16_mc13_neon, export=1
+function ff_\type\()_h264_qpel16_mc13_neon, export=1
         push            {r0, r1, r4, r11, lr}
         add             r1,  r1,  r2
-        b               put_h264_qpel16_mc11
-        .endfunc
+        b               \type\()_h264_qpel16_mc11
+endfunc
 
-function ff_put_h264_qpel16_mc23_neon, export=1
+function ff_\type\()_h264_qpel16_mc23_neon, export=1
         push            {r0, r1, r4-r5, r9-r11, lr}
         add             r1,  r1,  r2
-        b               put_h264_qpel16_mc21
-        .endfunc
+        b               \type\()_h264_qpel16_mc21
+endfunc
 
-function ff_put_h264_qpel16_mc33_neon, export=1
+function ff_\type\()_h264_qpel16_mc33_neon, export=1
         add             r1,  r1,  #1
         push            {r0, r1, r4, r11, lr}
         add             r1,  r1,  r2
         sub             r1,  r1,  #1
-        b               put_h264_qpel16_mc11
-        .endfunc
+        b               \type\()_h264_qpel16_mc11
+endfunc
+        .endm
+
+        h264_qpel16 put
+        h264_qpel16 avg
 
 @ Biweighted prediction
 
@@ -1513,7 +1717,7 @@ function biweight_h264_pixels_\w\()_neon
         biweight_\w     vmlsl.u8, vmlsl.u8
 40:     rsb             r5,  r5,  #0
         biweight_\w     vmlsl.u8, vmlal.u8
-        .endfunc
+endfunc
         .endm
 
         .macro  biweight_entry w, h, b=1
@@ -1522,7 +1726,7 @@ function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
 .if \b
         b               biweight_h264_pixels_\w\()_neon
 .endif
-        .endfunc
+endfunc
         .endm
 
         biweight_entry  16, 8
@@ -1650,7 +1854,7 @@ function weight_h264_pixels_\w\()_neon
         weight_\w       vadd.s16
 10:     rsb             r3,  r3,  #0
         weight_\w       vsub.s16
-        .endfunc
+endfunc
         .endm
 
         .macro  weight_entry w, h, b=1
@@ -1659,7 +1863,7 @@ function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
 .if \b
         b               weight_h264_pixels_\w\()_neon
 .endif
-        .endfunc
+endfunc
         .endm
 
         weight_entry    16, 8