diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 616a8132e535266fefd36267703c6749d7749ade..d9cdad8e0570b68bdb4439e34607987e75ecf6af 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -20,8 +20,6 @@
 
 #include "asm.S"
 
-        .fpu neon
-
         .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
         vtrn.32         \r0, \r4
         vtrn.32         \r1, \r5
         vtrn.8          \r6, \r7
         .endm
 
+        .macro transpose_4x4 r0 r1 r2 r3
+        vtrn.16         \r0, \r2
+        vtrn.16         \r1, \r3
+        vtrn.8          \r0, \r1
+        vtrn.8          \r2, \r3
+        .endm
+
         .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
         vswp            \r0, \r4
         vswp            \r1, \r5
@@ -178,7 +183,7 @@ function ff_\type\()_h264_chroma_mc8_neon, export=1
         bgt             5b
 
         pop             {r4-r7, pc}
-        .endfunc
+endfunc
         .endm
 
 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
@@ -312,9 +317,77 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
         bgt             5b
 
         pop             {r4-r7, pc}
-        .endfunc
+endfunc
         .endm
 
+        .macro  h264_chroma_mc2 type
+function ff_\type\()_h264_chroma_mc2_neon, export=1
+        push            {r4-r6, lr}
+        ldr             r4,  [sp, #16]
+        ldr             lr,  [sp, #20]
+        pld             [r1]
+        pld             [r1, r2]
+        orrs            r5,  r4,  lr
+        beq             2f
+
+        mul             r5,  r4,  lr
+        rsb             r6,  r5,  lr,  lsl #3
+        rsb             r12, r5,  r4,  lsl #3
+        sub             r4,  r5,  r4,  lsl #3
+        sub             r4,  r4,  lr,  lsl #3
+        add             r4,  r4,  #64
+        vdup.8          d0,  r4
+        vdup.8          d2,  r12
+        vdup.8          d1,  r6
+        vdup.8          d3,  r5
+        vtrn.16         q0,  q1
+1:
+        vld1.32         {d4[0]},  [r1], r2
+        vld1.32         {d4[1]},  [r1], r2
+        vrev64.32       d5,  d4
+        vld1.32         {d5[1]},  [r1]
+        vext.8          q3,  q2,  q2,  #1
+        vtrn.16         q2,  q3
+        vmull.u8        q8,  d4,  d0
+        vmlal.u8        q8,  d5,  d1
+.ifc \type,avg
+        vld1.16         {d18[0]}, [r0,:16], r2
+        vld1.16         {d18[1]}, [r0,:16]
+        sub             r0,  r0,  r2
+.endif
+        vtrn.32         d16, d17
+        vadd.i16        d16, d16, d17
+        vrshrn.u16      d16, q8,  #6
+.ifc \type,avg
+        vrhadd.u8       d16, d16, d18
+.endif
+        vst1.16         {d16[0]}, [r0,:16], r2
+        vst1.16         {d16[1]}, [r0,:16], r2
+        subs            r3,  r3,  #2
+        bgt             1b
+        pop             {r4-r6, pc}
+2:
+.ifc \type,put
+        ldrh            r5,  [r1], r2
+        strh            r5,  [r0], r2
+        ldrh            r6,  [r1], r2
+        strh            r6,  [r0], r2
+.else
+        vld1.16         {d16[0]}, [r1], r2
+        vld1.16         {d16[1]}, [r1], r2
+        vld1.16         {d18[0]}, [r0,:16], r2
+        vld1.16         {d18[1]}, [r0,:16]
+        sub             r0,  r0,  r2
+        vrhadd.u8       d16, d16, d18
+        vst1.16         {d16[0]}, [r0,:16], r2
+        vst1.16         {d16[1]}, [r0,:16], r2
+.endif
+        subs            r3,  r3,  #2
+        bgt             2b
+        pop             {r4-r6, pc}
+endfunc
+.endm
+
         .text
         .align
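Note: the new mc2 path, like the existing mc8/mc4 kernels above, evaluates the standard H.264 bilinear chroma filter with 1/8-pel weights; when x and y are both zero it falls through to the plain copy/average path at label 2:. A minimal C sketch of the filter (chroma_bilin is a hypothetical helper name, not FFmpeg's C fallback):

    #include <stdint.h>

    /* bilinear chroma interpolation as computed by the mc2/mc4/mc8
     * kernels: A..D are the four neighbouring samples, x and y the
     * fractional offsets in eighths (0..7); the weights sum to 64,
     * so the result already fits in a byte and needs no clipping */
    static inline uint8_t chroma_bilin(uint8_t A, uint8_t B,
                                       uint8_t C, uint8_t D, int x, int y)
    {
        return (uint8_t)((A * (8 - x) * (8 - y) +
                          B *      x  * (8 - y) +
                          C * (8 - x) *      y  +
                          D *      x  *      y  + 32) >> 6);
    }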
 
@@ -322,6 +395,8 @@ function ff_\type\()_h264_chroma_mc4_neon, export=1
         h264_chroma_mc8 avg
         h264_chroma_mc4 put
         h264_chroma_mc4 avg
+        h264_chroma_mc2 put
+        h264_chroma_mc2 avg
 
         /* H.264 loop filter */
 
@@ -443,7 +518,7 @@ function ff_h264_v_loop_filter_luma_neon, export=1
 
         align_pop_regs
         bx              lr
-        .endfunc
+endfunc
 
 function ff_h264_h_loop_filter_luma_neon, export=1
         h264_loop_filter_start
@@ -469,39 +544,33 @@ function ff_h264_h_loop_filter_luma_neon, export=1
         transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13
 
         align_push_regs
-        sub             sp,  sp,  #16
-        vst1.64         {d4, d5},  [sp,:128]
-        sub             sp,  sp,  #16
-        vst1.64         {d20,d21}, [sp,:128]
 
         h264_loop_filter_luma
 
-        vld1.64         {d20,d21}, [sp,:128]!
-        vld1.64         {d4, d5},  [sp,:128]!
-
-        transpose_8x8   q3, q10, q4, q8, q0, q5, q2, q13
+        transpose_4x4   q4, q8, q0, q5
 
         sub             r0,  r0,  r1, lsl #4
-        vst1.64         {d6},  [r0], r1
-        vst1.64         {d20}, [r0], r1
-        vst1.64         {d8},  [r0], r1
-        vst1.64         {d16}, [r0], r1
-        vst1.64         {d0},  [r0], r1
-        vst1.64         {d10}, [r0], r1
-        vst1.64         {d4},  [r0], r1
-        vst1.64         {d26}, [r0], r1
-        vst1.64         {d7},  [r0], r1
-        vst1.64         {d21}, [r0], r1
-        vst1.64         {d9},  [r0], r1
-        vst1.64         {d17}, [r0], r1
-        vst1.64         {d1},  [r0], r1
-        vst1.64         {d11}, [r0], r1
-        vst1.64         {d5},  [r0], r1
-        vst1.64         {d27}, [r0], r1
+        add             r0,  r0,  #2
+        vst1.32         {d8[0]},  [r0], r1
+        vst1.32         {d16[0]}, [r0], r1
+        vst1.32         {d0[0]},  [r0], r1
+        vst1.32         {d10[0]}, [r0], r1
+        vst1.32         {d8[1]},  [r0], r1
+        vst1.32         {d16[1]}, [r0], r1
+        vst1.32         {d0[1]},  [r0], r1
+        vst1.32         {d10[1]}, [r0], r1
+        vst1.32         {d9[0]},  [r0], r1
+        vst1.32         {d17[0]}, [r0], r1
+        vst1.32         {d1[0]},  [r0], r1
+        vst1.32         {d11[0]}, [r0], r1
+        vst1.32         {d9[1]},  [r0], r1
+        vst1.32         {d17[1]}, [r0], r1
+        vst1.32         {d1[1]},  [r0], r1
+        vst1.32         {d11[1]}, [r0], r1
 
         align_pop_regs
         bx              lr
-        .endfunc
+endfunc
 
         .macro h264_loop_filter_chroma
         vdup.8          d22, r2         @ alpha
@@ -552,7 +621,7 @@ function ff_h264_v_loop_filter_chroma_neon, export=1
         vst1.64         {d0},  [r0,:64], r1
 
         bx              lr
-        .endfunc
+endfunc
 
 function ff_h264_h_loop_filter_chroma_neon, export=1
         h264_loop_filter_start
@@ -590,7 +659,7 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
         vst1.32         {d2[1]},  [r0], r1
 
         bx              lr
-        .endfunc
+endfunc
 
         /* H.264 qpel MC */
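Note: the *_h/v/hv_lowpass_* helpers in the hunks below are all built on the H.264 six-tap half-pel filter applied by the lowpass_8 macro (defined in this file, outside the hunks shown here); the _l2 variants additionally average the filtered result with a second plane via vrhadd.u8 to form the quarter-pel positions. A plain-C sketch of the tap filter, under the assumption that lowpass_8 matches the spec filter (qpel_lowpass is a hypothetical name):

    #include <stdint.h>

    /* (1, -5, 20, 20, -5, 1)/32 half-pel filter with rounding and
     * clamping to [0,255]; s points two samples before the
     * interpolation position */
    static inline uint8_t qpel_lowpass(const uint8_t *s)
    {
        int v = s[0] - 5*s[1] + 20*s[2] + 20*s[3] - 5*s[4] + s[5];
        v = (v + 16) >> 5;
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }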
 
@@ -705,35 +774,48 @@ function put_h264_qpel16_h_lowpass_neon_packed
         mov             ip,  #16
         mov             lr,  r4
         b               put_h264_qpel8_h_lowpass_neon
-        .endfunc
+endfunc
 
-function put_h264_qpel16_h_lowpass_neon
+        .macro h264_qpel_h_lowpass type
+function \type\()_h264_qpel16_h_lowpass_neon
         push            {lr}
         mov             ip,  #16
-        bl              put_h264_qpel8_h_lowpass_neon
+        bl              \type\()_h264_qpel8_h_lowpass_neon
         sub             r0,  r0,  r3, lsl #4
         sub             r1,  r1,  r2, lsl #4
         add             r0,  r0,  #8
         add             r1,  r1,  #8
         mov             ip,  #16
         pop             {lr}
-        .endfunc
+endfunc
 
-function put_h264_qpel8_h_lowpass_neon
+function \type\()_h264_qpel8_h_lowpass_neon
 1:      vld1.64         {d0, d1},  [r1], r2
         vld1.64         {d16,d17}, [r1], r2
         subs            ip,  ip,  #2
         lowpass_8       d0,  d1,  d16, d17, d0,  d16
+.ifc \type,avg
+        vld1.8          {d2},     [r0,:64], r3
+        vrhadd.u8       d0,  d0,  d2
+        vld1.8          {d3},     [r0,:64]
+        vrhadd.u8       d16, d16, d3
+        sub             r0,  r0,  r3
+.endif
         vst1.64         {d0},     [r0,:64], r3
         vst1.64         {d16},    [r0,:64], r3
         bne             1b
         bx              lr
-        .endfunc
+endfunc
+        .endm
 
-function put_h264_qpel16_h_lowpass_l2_neon
+        h264_qpel_h_lowpass put
+        h264_qpel_h_lowpass avg
+
+        .macro h264_qpel_h_lowpass_l2 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon
         push            {lr}
         mov             ip,  #16
-        bl              put_h264_qpel8_h_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
         sub             r0,  r0,  r2, lsl #4
         sub             r1,  r1,  r2, lsl #4
         sub             r3,  r3,  r2, lsl #4
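Note: the .ifc \type,avg blocks added above and in the hunks below are the only difference between the put and avg kernels: the destination rows are read back and merged with the freshly filtered rows using vrhadd.u8, a per-byte rounded average. In C terms (a sketch, not FFmpeg code; avg_row8 is a hypothetical name):

    #include <stdint.h>

    /* vrhadd.u8 d, a, b computes (a + b + 1) >> 1 for each byte */
    static void avg_row8(uint8_t *dst, const uint8_t *filtered)
    {
        for (int i = 0; i < 8; i++)
            dst[i] = (uint8_t)((dst[i] + filtered[i] + 1) >> 1);
    }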
@@ -742,9 +824,9 @@ function put_h264_qpel16_h_lowpass_l2_neon
         add             r3,  r3,  #8
         mov             ip,  #16
         pop             {lr}
-        .endfunc
+endfunc
 
-function put_h264_qpel8_h_lowpass_l2_neon
+function \type\()_h264_qpel8_h_lowpass_l2_neon
 1:      vld1.64         {d0, d1},  [r1], r2
         vld1.64         {d16,d17}, [r1], r2
         vld1.64         {d28},     [r3], r2
@@ -752,11 +834,22 @@ function put_h264_qpel8_h_lowpass_l2_neon
         subs            ip,  ip,  #2
         lowpass_8       d0,  d1,  d16, d17, d0,  d1
         vrhadd.u8       q0,  q0,  q14
+.ifc \type,avg
+        vld1.8          {d2},      [r0,:64], r2
+        vrhadd.u8       d0,  d0,  d2
+        vld1.8          {d3},      [r0,:64]
+        vrhadd.u8       d1,  d1,  d3
+        sub             r0,  r0,  r2
+.endif
         vst1.64         {d0},      [r0,:64], r2
         vst1.64         {d1},      [r0,:64], r2
         bne             1b
         bx              lr
-        .endfunc
+endfunc
+        .endm
+
+        h264_qpel_h_lowpass_l2 put
+        h264_qpel_h_lowpass_l2 avg
 
 function put_h264_qpel16_v_lowpass_neon_packed
         mov             r4,  lr
@@ -771,24 +864,25 @@ function put_h264_qpel16_v_lowpass_neon_packed
         sub             r1,  r1,  r3, lsl #2
         mov             lr,  r4
         b               put_h264_qpel8_v_lowpass_neon
-        .endfunc
+endfunc
 
-function put_h264_qpel16_v_lowpass_neon
+        .macro h264_qpel_v_lowpass type
+function \type\()_h264_qpel16_v_lowpass_neon
         mov             r4,  lr
-        bl              put_h264_qpel8_v_lowpass_neon
+        bl              \type\()_h264_qpel8_v_lowpass_neon
         sub             r1,  r1,  r3, lsl #2
-        bl              put_h264_qpel8_v_lowpass_neon
+        bl              \type\()_h264_qpel8_v_lowpass_neon
         sub             r0,  r0,  r2, lsl #4
         add             r0,  r0,  #8
         sub             r1,  r1,  r3, lsl #4
         sub             r1,  r1,  r3, lsl #2
         add             r1,  r1,  #8
-        bl              put_h264_qpel8_v_lowpass_neon
+        bl              \type\()_h264_qpel8_v_lowpass_neon
         sub             r1,  r1,  r3, lsl #2
         mov             lr,  r4
-        .endfunc
+endfunc
 
-function put_h264_qpel8_v_lowpass_neon
+function \type\()_h264_qpel8_v_lowpass_neon
         vld1.64         {d8},  [r1], r3
         vld1.64         {d10}, [r1], r3
         vld1.64         {d12}, [r1], r3
@@ -810,6 +904,26 @@ function put_h264_qpel8_v_lowpass_neon
         lowpass_8       d26, d27, d28, d29, d26, d28
         transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28
 
+.ifc \type,avg
+        vld1.8          {d9},  [r0,:64], r2
+        vrhadd.u8       d8,  d8,  d9
+        vld1.8          {d11}, [r0,:64], r2
+        vrhadd.u8       d10, d10, d11
+        vld1.8          {d13}, [r0,:64], r2
+        vrhadd.u8       d12, d12, d13
+        vld1.8          {d15}, [r0,:64], r2
+        vrhadd.u8       d14, d14, d15
+        vld1.8          {d23}, [r0,:64], r2
+        vrhadd.u8       d22, d22, d23
+        vld1.8          {d25}, [r0,:64], r2
+        vrhadd.u8       d24, d24, d25
+        vld1.8          {d27}, [r0,:64], r2
+        vrhadd.u8       d26, d26, d27
+        vld1.8          {d29}, [r0,:64], r2
+        vrhadd.u8       d28, d28, d29
+        sub             r0,  r0,  r2,  lsl #3
+.endif
+
         vst1.64         {d8},  [r0,:64], r2
         vst1.64         {d10}, [r0,:64], r2
         vst1.64         {d12}, [r0,:64], r2
@@ -820,13 +934,18 @@ function put_h264_qpel8_v_lowpass_neon
         vst1.64         {d28}, [r0,:64], r2
 
         bx              lr
-        .endfunc
+endfunc
+        .endm
 
-function put_h264_qpel16_v_lowpass_l2_neon
+        h264_qpel_v_lowpass put
+        h264_qpel_v_lowpass avg
+
+        .macro h264_qpel_v_lowpass_l2 type
+function \type\()_h264_qpel16_v_lowpass_l2_neon
         mov             r4,  lr
-        bl              put_h264_qpel8_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
         sub             r1,  r1,  r3, lsl #2
-        bl              put_h264_qpel8_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
         sub             r0,  r0,  r3, lsl #4
         sub             ip,  ip,  r2, lsl #4
         add             r0,  r0,  #8
@@ -834,12 +953,12 @@ function put_h264_qpel16_v_lowpass_l2_neon
         sub             r1,  r1,  r3, lsl #4
         sub             r1,  r1,  r3, lsl #2
         add             r1,  r1,  #8
-        bl              put_h264_qpel8_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
         sub             r1,  r1,  r3, lsl #2
         mov             lr,  r4
-        .endfunc
+endfunc
 
-function put_h264_qpel8_v_lowpass_l2_neon
+function \type\()_h264_qpel8_v_lowpass_l2_neon
         vld1.64         {d8},  [r1], r3
         vld1.64         {d10}, [r1], r3
         vld1.64         {d12}, [r1], r3
@@ -872,10 +991,30 @@ function put_h264_qpel8_v_lowpass_l2_neon
         vld1.64         {d10}, [ip], r2
         vrhadd.u8       q2,  q2,  q11
         vld1.64         {d11}, [ip], r2
+        vrhadd.u8       q5,  q5,  q13
+
+.ifc \type,avg
+        vld1.8          {d16}, [r0,:64], r3
+        vrhadd.u8       d0,  d0,  d16
+        vld1.8          {d17}, [r0,:64], r3
+        vrhadd.u8       d1,  d1,  d17
+        vld1.8          {d16}, [r0,:64], r3
+        vrhadd.u8       d2,  d2,  d16
+        vld1.8          {d17}, [r0,:64], r3
+        vrhadd.u8       d3,  d3,  d17
+        vld1.8          {d16}, [r0,:64], r3
+        vrhadd.u8       d4,  d4,  d16
+        vld1.8          {d17}, [r0,:64], r3
+        vrhadd.u8       d5,  d5,  d17
+        vld1.8          {d16}, [r0,:64], r3
+        vrhadd.u8       d10, d10, d16
+        vld1.8          {d17}, [r0,:64], r3
+        vrhadd.u8       d11, d11, d17
+        sub             r0,  r0,  r3,  lsl #3
+.endif
 
         vst1.64         {d0},  [r0,:64], r3
         vst1.64         {d1},  [r0,:64], r3
-        vrhadd.u8       q5,  q5,  q13
         vst1.64         {d2},  [r0,:64], r3
         vst1.64         {d3},  [r0,:64], r3
         vst1.64         {d4},  [r0,:64], r3
@@ -884,7 +1023,11 @@ function put_h264_qpel8_v_lowpass_l2_neon
         vst1.64         {d11}, [r0,:64], r3
 
         bx              lr
-        .endfunc
+endfunc
+        .endm
+
+        h264_qpel_v_lowpass_l2 put
+        h264_qpel_v_lowpass_l2 avg
 
 function put_h264_qpel8_hv_lowpass_neon_top
         lowpass_const   ip
@@ -950,11 +1093,31 @@ function put_h264_qpel8_hv_lowpass_neon_top
         transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11
 
         bx              lr
-        .endfunc
+endfunc
 
-function put_h264_qpel8_hv_lowpass_neon
+        .macro h264_qpel8_hv_lowpass type
+function \type\()_h264_qpel8_hv_lowpass_neon
         mov             r10, lr
         bl              put_h264_qpel8_hv_lowpass_neon_top
+.ifc \type,avg
+        vld1.8          {d0},      [r0,:64], r2
+        vrhadd.u8       d12, d12, d0
+        vld1.8          {d1},      [r0,:64], r2
+        vrhadd.u8       d13, d13, d1
+        vld1.8          {d2},      [r0,:64], r2
+        vrhadd.u8       d14, d14, d2
+        vld1.8          {d3},      [r0,:64], r2
+        vrhadd.u8       d15, d15, d3
+        vld1.8          {d4},      [r0,:64], r2
+        vrhadd.u8       d8,  d8,  d4
+        vld1.8          {d5},      [r0,:64], r2
+        vrhadd.u8       d9,  d9,  d5
+        vld1.8          {d6},      [r0,:64], r2
+        vrhadd.u8       d10, d10, d6
+        vld1.8          {d7},      [r0,:64], r2
+        vrhadd.u8       d11, d11, d7
+        sub             r0,  r0,  r2,  lsl #3
+.endif
         vst1.64         {d12},     [r0,:64], r2
         vst1.64         {d13},     [r0,:64], r2
         vst1.64         {d14},     [r0,:64], r2
@@ -966,9 +1129,14 @@ function put_h264_qpel8_hv_lowpass_neon
 
         mov             lr,  r10
         bx              lr
-        .endfunc
+endfunc
+        .endm
 
-function put_h264_qpel8_hv_lowpass_l2_neon
+        h264_qpel8_hv_lowpass put
+        h264_qpel8_hv_lowpass avg
+
+        .macro h264_qpel8_hv_lowpass_l2 type
+function \type\()_h264_qpel8_hv_lowpass_l2_neon
         mov             r10, lr
         bl              put_h264_qpel8_hv_lowpass_neon_top
 
@@ -979,9 +1147,27 @@ function put_h264_qpel8_hv_lowpass_l2_neon
         vrhadd.u8       q1,  q1,  q7
         vld1.64         {d6, d7},  [r2,:128]!
         vrhadd.u8       q2,  q2,  q4
-
-        vst1.64         {d0},      [r0,:64], r3
         vrhadd.u8       q3,  q3,  q5
+.ifc \type,avg
+        vld1.8          {d16},     [r0,:64], r3
+        vrhadd.u8       d0,  d0,  d16
+        vld1.8          {d17},     [r0,:64], r3
+        vrhadd.u8       d1,  d1,  d17
+        vld1.8          {d18},     [r0,:64], r3
+        vrhadd.u8       d2,  d2,  d18
+        vld1.8          {d19},     [r0,:64], r3
+        vrhadd.u8       d3,  d3,  d19
+        vld1.8          {d20},     [r0,:64], r3
+        vrhadd.u8       d4,  d4,  d20
+        vld1.8          {d21},     [r0,:64], r3
+        vrhadd.u8       d5,  d5,  d21
+        vld1.8          {d22},     [r0,:64], r3
+        vrhadd.u8       d6,  d6,  d22
+        vld1.8          {d23},     [r0,:64], r3
+        vrhadd.u8       d7,  d7,  d23
+        sub             r0,  r0,  r3,  lsl #3
+.endif
+        vst1.64         {d0},      [r0,:64], r3
         vst1.64         {d1},      [r0,:64], r3
         vst1.64         {d2},      [r0,:64], r3
         vst1.64         {d3},      [r0,:64], r3
@@ -992,82 +1178,94 @@ function put_h264_qpel8_hv_lowpass_l2_neon
 
         mov             lr,  r10
         bx              lr
-        .endfunc
+endfunc
+        .endm
+
+        h264_qpel8_hv_lowpass_l2 put
+        h264_qpel8_hv_lowpass_l2 avg
 
-function put_h264_qpel16_hv_lowpass_neon
+        .macro h264_qpel16_hv type
+function \type\()_h264_qpel16_hv_lowpass_neon
         mov             r9,  lr
-        bl              put_h264_qpel8_hv_lowpass_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_neon
         sub             r1,  r1,  r3, lsl #2
-        bl              put_h264_qpel8_hv_lowpass_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_neon
         sub             r1,  r1,  r3, lsl #4
         sub             r1,  r1,  r3, lsl #2
         add             r1,  r1,  #8
         sub             r0,  r0,  r2, lsl #4
         add             r0,  r0,  #8
-        bl              put_h264_qpel8_hv_lowpass_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_neon
         sub             r1,  r1,  r3, lsl #2
         mov             lr,  r9
-        b               put_h264_qpel8_hv_lowpass_neon
-        .endfunc
+        b               \type\()_h264_qpel8_hv_lowpass_neon
+endfunc
 
-function put_h264_qpel16_hv_lowpass_l2_neon
+function \type\()_h264_qpel16_hv_lowpass_l2_neon
         mov             r9,  lr
         sub             r2,  r4,  #256
-        bl              put_h264_qpel8_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
         sub             r1,  r1,  r3, lsl #2
-        bl              put_h264_qpel8_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
         sub             r1,  r1,  r3, lsl #4
         sub             r1,  r1,  r3, lsl #2
         add             r1,  r1,  #8
         sub             r0,  r0,  r3, lsl #4
         add             r0,  r0,  #8
-        bl              put_h264_qpel8_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
         sub             r1,  r1,  r3, lsl #2
         mov             lr,  r9
-        b               put_h264_qpel8_hv_lowpass_l2_neon
-        .endfunc
+        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
+endfunc
+        .endm
 
-function ff_put_h264_qpel8_mc10_neon, export=1
+        h264_qpel16_hv put
+        h264_qpel16_hv avg
+
+        .macro h264_qpel8 type
+function ff_\type\()_h264_qpel8_mc10_neon, export=1
         lowpass_const   r3
         mov             r3,  r1
         sub             r1,  r1,  #2
         mov             ip,  #8
-        b               put_h264_qpel8_h_lowpass_l2_neon
-        .endfunc
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
 
-function ff_put_h264_qpel8_mc20_neon, export=1
+function ff_\type\()_h264_qpel8_mc20_neon, export=1
         lowpass_const   r3
         sub             r1,  r1,  #2
         mov             r3,  r2
         mov             ip,  #8
-        b               put_h264_qpel8_h_lowpass_neon
-        .endfunc
+        b               \type\()_h264_qpel8_h_lowpass_neon
+endfunc
 
-function ff_put_h264_qpel8_mc30_neon, export=1
+function ff_\type\()_h264_qpel8_mc30_neon, export=1
         lowpass_const   r3
         add             r3,  r1,  #1
         sub             r1,  r1,  #2
         mov             ip,  #8
-        b               put_h264_qpel8_h_lowpass_l2_neon
-        .endfunc
+        b               \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
 
-function ff_put_h264_qpel8_mc01_neon, export=1
+function ff_\type\()_h264_qpel8_mc01_neon, export=1
         push            {lr}
         mov             ip,  r1
-put_h264_qpel8_mc01:
+\type\()_h264_qpel8_mc01:
         lowpass_const   r3
         mov             r3,  r2
         sub             r1,  r1,  r2, lsl #1
         vpush           {d8-d15}
-        bl              put_h264_qpel8_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
         vpop            {d8-d15}
         pop             {pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel8_mc11_neon, export=1
-        push            {r0, r1, r2, lr}
-put_h264_qpel8_mc11:
+function ff_\type\()_h264_qpel8_mc11_neon, export=1
+        push            {r0, r1, r11, lr}
+\type\()_h264_qpel8_mc11:
         lowpass_const   r3
+        mov             r11, sp
+        bic             sp,  sp,  #15
         sub             sp,  sp,  #64
         mov             r0,  sp
         sub             r1,  r1,  #2
@@ -1075,20 +1273,20 @@ put_h264_qpel8_mc11:
         mov             ip,  #8
         vpush           {d8-d15}
         bl              put_h264_qpel8_h_lowpass_neon
-        ldrd            r0,  [sp, #128]
+        ldrd            r0,  [r11]
         mov             r3,  r2
         add             ip,  sp,  #64
         sub             r1,  r1,  r2, lsl #1
         mov             r2,  #8
-        bl              put_h264_qpel8_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
         vpop            {d8-d15}
-        add             sp,  sp,  #76
-        pop             {pc}
-        .endfunc
+        add             sp,  r11, #8
+        pop             {r11, pc}
+endfunc
 
-function ff_put_h264_qpel8_mc21_neon, export=1
+function ff_\type\()_h264_qpel8_mc21_neon, export=1
         push            {r0, r1, r4, r10, r11, lr}
-put_h264_qpel8_mc21:
+\type\()_h264_qpel8_mc21:
         lowpass_const   r3
         mov             r11, sp
         bic             sp,  sp,  #15
@@ -1105,33 +1303,33 @@ put_h264_qpel8_mc21:
         sub             r1,  r1,  #2
         mov             r3,  r2
         sub             r2,  r4,  #64
-        bl              put_h264_qpel8_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
         vpop            {d8-d15}
         add             sp,  r11,  #8
         pop             {r4, r10, r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel8_mc31_neon, export=1
+function ff_\type\()_h264_qpel8_mc31_neon, export=1
         add             r1,  r1,  #1
-        push            {r0, r1, r2, lr}
+        push            {r0, r1, r11, lr}
         sub             r1,  r1,  #1
-        b               put_h264_qpel8_mc11
-        .endfunc
+        b               \type\()_h264_qpel8_mc11
+endfunc
 
-function ff_put_h264_qpel8_mc02_neon, export=1
+function ff_\type\()_h264_qpel8_mc02_neon, export=1
         push            {lr}
         lowpass_const   r3
         sub             r1,  r1,  r2, lsl #1
         mov             r3,  r2
         vpush           {d8-d15}
-        bl              put_h264_qpel8_v_lowpass_neon
+        bl              \type\()_h264_qpel8_v_lowpass_neon
         vpop            {d8-d15}
         pop             {pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel8_mc12_neon, export=1
+function ff_\type\()_h264_qpel8_mc12_neon, export=1
         push            {r0, r1, r4, r10, r11, lr}
-put_h264_qpel8_mc12:
+\type\()_h264_qpel8_mc12:
         lowpass_const   r3
         mov             r11, sp
         bic             sp,  sp,  #15
@@ -1147,13 +1345,13 @@ put_h264_qpel8_mc12:
         sub             r1,  r1,  r3, lsl #1
         sub             r1,  r1,  #2
         sub             r2,  r4,  #64
-        bl              put_h264_qpel8_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
         vpop            {d8-d15}
         add             sp,  r11,  #8
         pop             {r4, r10, r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel8_mc22_neon, export=1
+function ff_\type\()_h264_qpel8_mc22_neon, export=1
         push            {r4, r10, r11, lr}
         mov             r11, sp
         bic             sp,  sp,  #15
@@ -1163,103 +1361,109 @@ function ff_put_h264_qpel8_mc22_neon, export=1
         sub             sp,  sp,  #(16*12)
         mov             r4,  sp
         vpush           {d8-d15}
-        bl              put_h264_qpel8_hv_lowpass_neon
+        bl              \type\()_h264_qpel8_hv_lowpass_neon
         vpop            {d8-d15}
         mov             sp,  r11
         pop             {r4, r10, r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel8_mc32_neon, export=1
+function ff_\type\()_h264_qpel8_mc32_neon, export=1
         push            {r0, r1, r4, r10, r11, lr}
         add             r1,  r1,  #1
-        b               put_h264_qpel8_mc12
-        .endfunc
+        b               \type\()_h264_qpel8_mc12
+endfunc
 
-function ff_put_h264_qpel8_mc03_neon, export=1
+function ff_\type\()_h264_qpel8_mc03_neon, export=1
         push            {lr}
         add             ip,  r1,  r2
-        b               put_h264_qpel8_mc01
-        .endfunc
+        b               \type\()_h264_qpel8_mc01
+endfunc
 
-function ff_put_h264_qpel8_mc13_neon, export=1
-        push            {r0, r1, r2, lr}
+function ff_\type\()_h264_qpel8_mc13_neon, export=1
+        push            {r0, r1, r11, lr}
         add             r1,  r1,  r2
-        b               put_h264_qpel8_mc11
-        .endfunc
+        b               \type\()_h264_qpel8_mc11
+endfunc
 
-function ff_put_h264_qpel8_mc23_neon, export=1
+function ff_\type\()_h264_qpel8_mc23_neon, export=1
         push            {r0, r1, r4, r10, r11, lr}
         add             r1,  r1,  r2
-        b               put_h264_qpel8_mc21
-        .endfunc
+        b               \type\()_h264_qpel8_mc21
+endfunc
 
-function ff_put_h264_qpel8_mc33_neon, export=1
+function ff_\type\()_h264_qpel8_mc33_neon, export=1
         add             r1,  r1,  #1
-        push            {r0, r1, r2, lr}
+        push            {r0, r1, r11, lr}
         add             r1,  r1,  r2
         sub             r1,  r1,  #1
-        b               put_h264_qpel8_mc11
-        .endfunc
+        b               \type\()_h264_qpel8_mc11
+endfunc
+        .endm
 
-function ff_put_h264_qpel16_mc10_neon, export=1
+        h264_qpel8 put
+        h264_qpel8 avg
+
+        .macro h264_qpel16 type
+function ff_\type\()_h264_qpel16_mc10_neon, export=1
         lowpass_const   r3
         mov             r3,  r1
         sub             r1,  r1,  #2
-        b               put_h264_qpel16_h_lowpass_l2_neon
-        .endfunc
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
 
-function ff_put_h264_qpel16_mc20_neon, export=1
+function ff_\type\()_h264_qpel16_mc20_neon, export=1
         lowpass_const   r3
         sub             r1,  r1,  #2
         mov             r3,  r2
-        b               put_h264_qpel16_h_lowpass_neon
-        .endfunc
+        b               \type\()_h264_qpel16_h_lowpass_neon
+endfunc
 
-function ff_put_h264_qpel16_mc30_neon, export=1
+function ff_\type\()_h264_qpel16_mc30_neon, export=1
         lowpass_const   r3
         add             r3,  r1,  #1
         sub             r1,  r1,  #2
-        b               put_h264_qpel16_h_lowpass_l2_neon
-        .endfunc
+        b               \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
 
-function ff_put_h264_qpel16_mc01_neon, export=1
+function ff_\type\()_h264_qpel16_mc01_neon, export=1
         push            {r4, lr}
         mov             ip,  r1
-put_h264_qpel16_mc01:
+\type\()_h264_qpel16_mc01:
         lowpass_const   r3
         mov             r3,  r2
         sub             r1,  r1,  r2, lsl #1
         vpush           {d8-d15}
-        bl              put_h264_qpel16_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
         vpop            {d8-d15}
         pop             {r4, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel16_mc11_neon, export=1
-        push            {r0, r1, r4, lr}
-put_h264_qpel16_mc11:
+function ff_\type\()_h264_qpel16_mc11_neon, export=1
+        push            {r0, r1, r4, r11, lr}
+\type\()_h264_qpel16_mc11:
         lowpass_const   r3
+        mov             r11, sp
+        bic             sp,  sp,  #15
         sub             sp,  sp,  #256
         mov             r0,  sp
         sub             r1,  r1,  #2
         mov             r3,  #16
         vpush           {d8-d15}
         bl              put_h264_qpel16_h_lowpass_neon
-        add             r0,  sp,  #256
-        ldrd            r0,  [r0, #64]
+        ldrd            r0,  [r11]
         mov             r3,  r2
         add             ip,  sp,  #64
         sub             r1,  r1,  r2, lsl #1
         mov             r2,  #16
-        bl              put_h264_qpel16_v_lowpass_l2_neon
+        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
         vpop            {d8-d15}
-        add             sp,  sp,  #(256+8)
-        pop             {r4, pc}
-        .endfunc
+        add             sp,  r11, #8
+        pop             {r4, r11, pc}
+endfunc
 
-function ff_put_h264_qpel16_mc21_neon, export=1
+function ff_\type\()_h264_qpel16_mc21_neon, export=1
         push            {r0, r1, r4-r5, r9-r11, lr}
-put_h264_qpel16_mc21:
+\type\()_h264_qpel16_mc21:
         lowpass_const   r3
         mov             r11, sp
         bic             sp,  sp,  #15
@@ -1273,33 +1477,33 @@ put_h264_qpel16_mc21:
         sub             r1,  r1,  r2, lsl #1
         sub             r1,  r1,  #2
         mov             r3,  r2
-        bl              put_h264_qpel16_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
         vpop            {d8-d15}
         add             sp,  r11,  #8
         pop             {r4-r5, r9-r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel16_mc31_neon, export=1
+function ff_\type\()_h264_qpel16_mc31_neon, export=1
         add             r1,  r1,  #1
-        push            {r0, r1, r4, lr}
+        push            {r0, r1, r4, r11, lr}
         sub             r1,  r1,  #1
-        b               put_h264_qpel16_mc11
-        .endfunc
+        b               \type\()_h264_qpel16_mc11
+endfunc
 
-function ff_put_h264_qpel16_mc02_neon, export=1
+function ff_\type\()_h264_qpel16_mc02_neon, export=1
         push            {r4, lr}
         lowpass_const   r3
         sub             r1,  r1,  r2, lsl #1
         mov             r3,  r2
         vpush           {d8-d15}
-        bl              put_h264_qpel16_v_lowpass_neon
+        bl              \type\()_h264_qpel16_v_lowpass_neon
         vpop            {d8-d15}
         pop             {r4, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel16_mc12_neon, export=1
+function ff_\type\()_h264_qpel16_mc12_neon, export=1
         push            {r0, r1, r4-r5, r9-r11, lr}
-put_h264_qpel16_mc12:
+\type\()_h264_qpel16_mc12:
         lowpass_const   r3
         mov             r11, sp
         bic             sp,  sp,  #15
@@ -1314,13 +1518,13 @@ put_h264_qpel16_mc12:
         sub             r1,  r1,  r3, lsl #1
         sub             r1,  r1,  #2
         mov             r2,  r3
-        bl              put_h264_qpel16_hv_lowpass_l2_neon
+        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
         vpop            {d8-d15}
         add             sp,  r11,  #8
         pop             {r4-r5, r9-r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel16_mc22_neon, export=1
+function ff_\type\()_h264_qpel16_mc22_neon, export=1
         push            {r4, r9-r11, lr}
         lowpass_const   r3
         mov             r11, sp
@@ -1331,43 +1535,47 @@ function ff_put_h264_qpel16_mc22_neon, export=1
         sub             sp,  sp,  #(16*12)
         mov             r4,  sp
         vpush           {d8-d15}
-        bl              put_h264_qpel16_hv_lowpass_neon
+        bl              \type\()_h264_qpel16_hv_lowpass_neon
         vpop            {d8-d15}
         mov             sp,  r11
         pop             {r4, r9-r11, pc}
-        .endfunc
+endfunc
 
-function ff_put_h264_qpel16_mc32_neon, export=1
+function ff_\type\()_h264_qpel16_mc32_neon, export=1
         push            {r0, r1, r4-r5, r9-r11, lr}
         add             r1,  r1,  #1
-        b               put_h264_qpel16_mc12
-        .endfunc
+        b               \type\()_h264_qpel16_mc12
+endfunc
 
-function ff_put_h264_qpel16_mc03_neon, export=1
+function ff_\type\()_h264_qpel16_mc03_neon, export=1
         push            {r4, lr}
         add             ip,  r1,  r2
-        b               put_h264_qpel16_mc01
-        .endfunc
+        b               \type\()_h264_qpel16_mc01
+endfunc
 
-function ff_put_h264_qpel16_mc13_neon, export=1
-        push            {r0, r1, r4, lr}
+function ff_\type\()_h264_qpel16_mc13_neon, export=1
+        push            {r0, r1, r4, r11, lr}
         add             r1,  r1,  r2
-        b               put_h264_qpel16_mc11
-        .endfunc
+        b               \type\()_h264_qpel16_mc11
+endfunc
 
-function ff_put_h264_qpel16_mc23_neon, export=1
+function ff_\type\()_h264_qpel16_mc23_neon, export=1
         push            {r0, r1, r4-r5, r9-r11, lr}
         add             r1,  r1,  r2
-        b               put_h264_qpel16_mc21
-        .endfunc
+        b               \type\()_h264_qpel16_mc21
+endfunc
 
-function ff_put_h264_qpel16_mc33_neon, export=1
+function ff_\type\()_h264_qpel16_mc33_neon, export=1
         add             r1,  r1,  #1
-        push            {r0, r1, r4, lr}
+        push            {r0, r1, r4, r11, lr}
         add             r1,  r1,  r2
         sub             r1,  r1,  #1
-        b               put_h264_qpel16_mc11
-        .endfunc
+        b               \type\()_h264_qpel16_mc11
+endfunc
+        .endm
+
+        h264_qpel16 put
+        h264_qpel16 avg
 
 @ Biweighted prediction
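Note: the biweight helpers below (touched here only for the .endfunc change) combine two predictions as H.264 bidirectional weighted prediction requires. A C sketch of the per-sample formula from the spec, assuming the NEON code follows it; the FFmpeg entry points take a single pre-combined offset, so the exact parameterisation differs, and biweight_sample is a hypothetical name:

    #include <stdint.h>

    /* bidirectional weighted prediction: two sources, two weights,
     * averaged offsets, clamped to [0,255] */
    static inline uint8_t biweight_sample(uint8_t a, uint8_t b,
                                          int w0, int w1,
                                          int o0, int o1, int log2_denom)
    {
        int v = ((a * w0 + b * w1 + (1 << log2_denom)) >> (log2_denom + 1))
              + ((o0 + o1 + 1) >> 1);
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }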
 
@@ -1511,7 +1719,7 @@ function biweight_h264_pixels_\w\()_neon
         biweight_\w     vmlsl.u8, vmlsl.u8
 40:     rsb             r5,  r5,  #0
         biweight_\w     vmlsl.u8, vmlal.u8
-        .endfunc
+endfunc
         .endm
 
         .macro  biweight_entry w, h, b=1
@@ -1520,7 +1728,7 @@ function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
 .if \b
         b               biweight_h264_pixels_\w\()_neon
 .endif
-        .endfunc
+endfunc
         .endm
 
         biweight_entry  16, 8
@@ -1536,3 +1744,140 @@ function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
         biweight_entry  4,  2
         biweight_entry  4,  4,  b=0
         biweight_func   4
+
+@ Weighted prediction
+
+        .macro  weight_16 add
+        vdup.8          d0,  r3
+1:      subs            ip,  ip,  #2
+        vld1.8          {d20-d21},[r0,:128], r1
+        vmull.u8        q2,  d0,  d20
+        pld             [r0]
+        vmull.u8        q3,  d0,  d21
+        vld1.8          {d28-d29},[r0,:128], r1
+        vmull.u8        q12, d0,  d28
+        pld             [r0]
+        vmull.u8        q13, d0,  d29
+        \add            q2,  q8,  q2
+        vrshl.s16       q2,  q2,  q9
+        \add            q3,  q8,  q3
+        vrshl.s16       q3,  q3,  q9
+        vqmovun.s16     d4,  q2
+        vqmovun.s16     d5,  q3
+        \add            q12, q8,  q12
+        vrshl.s16       q12, q12, q9
+        \add            q13, q8,  q13
+        vrshl.s16       q13, q13, q9
+        vqmovun.s16     d24, q12
+        vqmovun.s16     d25, q13
+        vst1.8          {d4- d5}, [r4,:128], r1
+        vst1.8          {d24-d25},[r4,:128], r1
+        bne             1b
+        pop             {r4, pc}
+        .endm
+
+        .macro  weight_8 add
+        vdup.8          d0,  r3
+1:      subs            ip,  ip,  #2
+        vld1.8          {d4},[r0,:64], r1
+        vmull.u8        q1,  d0,  d4
+        pld             [r0]
+        vld1.8          {d6},[r0,:64], r1
+        vmull.u8        q10, d0,  d6
+        \add            q1,  q8,  q1
+        pld             [r0]
+        vrshl.s16       q1,  q1,  q9
+        vqmovun.s16     d2,  q1
+        \add            q10, q8,  q10
+        vrshl.s16       q10, q10, q9
+        vqmovun.s16     d4,  q10
+        vst1.8          {d2},[r4,:64], r1
+        vst1.8          {d4},[r4,:64], r1
+        bne             1b
+        pop             {r4, pc}
+        .endm
+
+        .macro  weight_4 add
+        vdup.8          d0,  r3
+        vmov            q1,  q8
+        vmov            q10, q8
+1:      subs            ip,  ip,  #4
+        vld1.32         {d4[0]},[r0,:32], r1
+        vld1.32         {d4[1]},[r0,:32], r1
+        vmull.u8        q1,  d0,  d4
+        pld             [r0]
+        blt             2f
+        vld1.32         {d6[0]},[r0,:32], r1
+        vld1.32         {d6[1]},[r0,:32], r1
+        vmull.u8        q10, d0,  d6
+        pld             [r0]
+        \add            q1,  q8,  q1
+        vrshl.s16       q1,  q1,  q9
+        vqmovun.s16     d2,  q1
+        \add            q10, q8,  q10
+        vrshl.s16       q10, q10, q9
+        vqmovun.s16     d4,  q10
+        vmov            q10, q8
+        vst1.32         {d2[0]},[r4,:32], r1
+        vst1.32         {d2[1]},[r4,:32], r1
+        vmov            q1,  q8
+        vst1.32         {d4[0]},[r4,:32], r1
+        vst1.32         {d4[1]},[r4,:32], r1
+        bne             1b
+        pop             {r4, pc}
+2:      \add            q1,  q8,  q1
+        vrshl.s16       q1,  q1,  q9
+        vqmovun.s16     d2,  q1
+        vst1.32         {d2[0]},[r4,:32], r1
+        vst1.32         {d2[1]},[r4,:32], r1
+        pop             {r4, pc}
+        .endm
+
+        .macro  weight_func w
+function weight_h264_pixels_\w\()_neon
+        push            {r4, lr}
+        ldr             r4,  [sp, #8]
+        cmp             r2,  #1
+        lsl             r4,  r4,  r2
+        vdup.16         q8,  r4
+        mov             r4,  r0
+        ble             20f
+        rsb             lr,  r2,  #1
+        vdup.16         q9,  lr
+        cmp             r3,  #0
+        blt             10f
+        weight_\w       vhadd.s16
+10:     rsb             r3,  r3,  #0
+        weight_\w       vhsub.s16
+20:     rsb             lr,  r2,  #0
+        vdup.16         q9,  lr
+        cmp             r3,  #0
+        blt             10f
+        weight_\w       vadd.s16
+10:     rsb             r3,  r3,  #0
+        weight_\w       vsub.s16
+endfunc
+        .endm
+
+        .macro  weight_entry w, h, b=1
+function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
+        mov             ip,  #\h
+.if \b
+        b               weight_h264_pixels_\w\()_neon
+.endif
+endfunc
+        .endm
+
+        weight_entry    16, 8
+        weight_entry    16, 16, b=0
+        weight_func     16
+
+        weight_entry    8,  16
+        weight_entry    8,  4
+        weight_entry    8,  8,  b=0
+        weight_func     8
+
+        weight_entry    4,  8
+        weight_entry    4,  2
+        weight_entry    4,  4,  b=0
+        weight_func     4
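
Note: the weight_16/8/4 macros above implement H.264 explicit unidirectional weighted prediction: scale by the weight, round by the log2 denominator, add the offset and saturate to 8 bits. The NEON code folds the offset in before the shift (offset << log2_denom, added with vhadd/vadd), which is arithmetically equivalent, and handles negative weights via the vmlsl paths. A C sketch of the per-sample formula (weight_sample is a hypothetical helper, not FFmpeg's C reference):

    #include <stdint.h>

    /* explicit weighted prediction for one sample, clamped to [0,255];
     * assumes arithmetic right shift for negative weights, as on ARM */
    static inline uint8_t weight_sample(uint8_t src, int w, int offset,
                                        int log2_denom)
    {
        int v = log2_denom > 0
              ? ((src * w + (1 << (log2_denom - 1))) >> log2_denom) + offset
              : src * w + offset;
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }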