From 6f04b146875c45e6f7845a7bb5fb7fdf8e7534f1 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Thu, 3 Sep 2015 09:30:44 +0300
Subject: [PATCH] arm: Implement x264_mbtree_propagate_{cost, list}_neon

The cost function could be simplified to avoid having to clobber
q4/q5, but that requires reordering instructions, which increases
the total runtime.

checkasm timing             Cortex-A7      A8      A9
mbtree_propagate_cost_c         63702  155835   62829
mbtree_propagate_cost_neon      17199   10454   11106
mbtree_propagate_list_c        104203  108949   84532
mbtree_propagate_list_neon      82035   78348   60410
---
 common/arm/mc-a.S | 119 ++++++++++++++++++++++++++++++++++++++++++++++
 common/arm/mc-c.c |   8 ++++
 2 files changed, 127 insertions(+)

diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 5e0c117d..30d1c1ad 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -28,6 +28,11 @@
 
 #include "asm.S"
 
+.section .rodata
+.align 4
+pw_0to15:
+.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
 .text
 
 // note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8
@@ -1760,3 +1765,117 @@ function integral_init8v_neon
 2:
     bx              lr
 endfunc
+
+function x264_mbtree_propagate_cost_neon
+    push            {r4-r5,lr}
+    ldrd            r4,  r5,  [sp, #12]
+    ldr             lr,  [sp, #20]
+    vld1.32         {d6[], d7[]}, [r5]
+8:
+    subs            lr,  lr,  #8
+    vld1.16         {q8},  [r1]!
+    vld1.16         {q9},  [r2]!
+    vld1.16         {q10}, [r3]!
+    vld1.16         {q11}, [r4]!
+    vbic.u16        q10, #0xc000
+    vmin.u16        q10, q9,  q10
+    vmull.u16       q12, d18, d22           @ propagate_intra
+    vmull.u16       q13, d19, d23           @ propagate_intra
+    vsubl.u16       q14, d18, d20           @ propagate_num
+    vsubl.u16       q15, d19, d21           @ propagate_num
+    vmovl.u16       q10, d18                @ propagate_denom
+    vmovl.u16       q11, d19                @ propagate_denom
+    vmovl.u16       q9,  d17
+    vmovl.u16       q8,  d16
+    vcvt.f32.s32    q12, q12
+    vcvt.f32.s32    q13, q13
+    vcvt.f32.s32    q14, q14
+    vcvt.f32.s32    q15, q15
+    vcvt.f32.s32    q10, q10
+    vcvt.f32.s32    q11, q11
+    vrecpe.f32      q0,  q10
+    vrecpe.f32      q1,  q11
+    vcvt.f32.s32    q8,  q8
+    vcvt.f32.s32    q9,  q9
+    vrecps.f32      q10, q0,  q10
+    vrecps.f32      q11, q1,  q11
+    vmla.f32        q8,  q12, q3            @ propagate_amount
+    vmla.f32        q9,  q13, q3            @ propagate_amount
+    vmul.f32        q0,  q0,  q10
+    vmul.f32        q1,  q1,  q11
+    vmul.f32        q8,  q8,  q14
+    vmul.f32        q9,  q9,  q15
+    vmul.f32        q0,  q8,  q0
+    vmul.f32        q1,  q9,  q1
+    vcvt.s32.f32    q0,  q0
+    vcvt.s32.f32    q1,  q1
+    vqmovn.s32      d0,  q0
+    vqmovn.s32      d1,  q1
+    vst1.16         {q0},  [r0]!
+    bgt             8b
+    pop             {r4-r5,pc}
+endfunc
+
+function x264_mbtree_propagate_list_internal_neon
+    vld2.16         {d4[], d5[]}, [sp]      @ bipred_weight, mb_y
+    movrel          r12, pw_0to15
+    vmov.u16        q10, #0xc000
+    vld1.16         {q0},  [r12, :128]      @ h->mb.i_mb_x, h->mb.i_mb_y
+    vmov.u32        q11, #4
+    vmov.u8         q3,  #32
+    vdup.u16        q8,  d5[0]              @ mb_y
+    vzip.u16        q0,  q8
+    ldr             r12, [sp, #8]
+8:
+    subs            r12, r12, #8
+    vld1.16         {q14}, [r1, :128]!      @ propagate_amount
+    vld1.16         {q15}, [r2, :128]!      @ lowres_cost
+    vld1.16         {q8, q9},  [r0, :128]!
+    vand            q15, q15, q10
+    vceq.u16        q1,  q15, q10
+    vmull.u16       q12, d28, d4
+    vmull.u16       q13, d29, d4
+    vrshrn.u32      d30, q12, #6
+    vrshrn.u32      d31, q13, #6
+    vbsl            q1,  q15, q14           @ if( lists_used == 3 )
+    @ propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+    vshr.s16        q12, q8,  #5
+    vshr.s16        q13, q9,  #5
+    vuzp.16         q8,  q9                 @ x & 31, y & 31
+    vadd.s16        q12, q12, q0
+    vadd.s16        q0,  q0,  q11
+    vmovn.i16       d16, q8
+    vmovn.i16       d17, q9
+    vadd.s16        q13, q13, q0
+    vbic.i16        q8,  #128+64+32
+    vadd.s16        q0,  q0,  q11
+    vbic.i16        q8,  #(128+64+32)<<8
+    vst1.16         {q12, q13},  [r3, :128]!
+    vsub.i8         q9,  q3,  q8
+    vmull.u8        q12, d17, d16           @ idx3weight = y*x
+    vmull.u8        q14, d19, d16           @ idx1weight = (32-y)*x
+    vmull.u8        q15, d19, d18           @ idx0weight = (32-y)*(32-x)
+    vmull.u8        q13, d17, d18           @ idx2weight = y*(32-x)
+    vmull.u16       q9,  d28, d2            @ idx1weight
+    vmull.u16       q8,  d29, d3
+    vmull.u16       q14, d30, d2            @ idx0weight
+    vmull.u16       q15, d31, d3
+    vrshrn.u32      d18, q9,  #10           @ idx1weight
+    vrshrn.u32      d19, q8,  #10
+    vrshrn.u32      d16, q14, #10           @ idx0weight
+    vrshrn.u32      d17, q15, #10
+    vmull.u16       q14, d24, d2            @ idx3weight
+    vmull.u16       q15, d25, d3
+    vzip.16         q8,  q9
+    vmull.u16       q12, d26, d2            @ idx2weight
+    vmull.u16       q13, d27, d3
+    vst1.16         {q8, q9},  [r3, :128]!
+    vrshrn.u32      d19, q15, #10           @ idx3weight
+    vrshrn.u32      d18, q14, #10
+    vrshrn.u32      d16, q12, #10           @ idx2weight
+    vrshrn.u32      d17, q13, #10
+    vzip.16         q8,  q9
+    vst1.16         {q8, q9},  [r3, :128]!
+    bge             8b
+    bx              lr
+endfunc
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index dd86fb24..0ead7b02 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2009-2015 x264 project
  *
  * Authors: David Conrad
+ *          Janne Grunau
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -104,6 +105,8 @@ void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
 void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
 void integral_init8v_neon( uint16_t *, intptr_t );
 
+void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
+
 #if !HIGH_BIT_DEPTH
 static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
@@ -226,6 +229,8 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
 }
 #endif // !HIGH_BIT_DEPTH
 
+PROPAGATE_LIST(neon)
+
 void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
 {
     if( !(cpu&X264_CPU_ARMV6) )
@@ -281,6 +286,9 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
     pf->integral_init8h = integral_init8h_neon;
     pf->integral_init4v = integral_init4v_neon;
     pf->integral_init8v = integral_init8v_neon;
+
+    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
+    pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
 #endif // !HIGH_BIT_DEPTH
 
 // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
-- 
2.39.2
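
Note for readers unfamiliar with the mbtree propagation pass: below is a
plain-C sketch of what the cost kernel above computes, reconstructed from
the assembly. The names (mbtree_propagate_cost_ref, CLAMP_S16) are
descriptive placeholders rather than x264's own, so treat this as an
illustration of the math, not the reference implementation.

#include <stdint.h>

/* vqmovn.s32 narrows with signed saturation */
#define CLAMP_S16(x) ((x) > 32767 ? 32767 : (x))

static void mbtree_propagate_cost_ref( int16_t *dst, const uint16_t *propagate_in,
                                       const uint16_t *intra_costs,
                                       const uint16_t *inter_costs,
                                       const uint16_t *inv_qscales,
                                       const float *fps_factor, int len )
{
    float fps = *fps_factor;                /* vld1.32 {d6[],d7[]}, [r5] */
    for( int i = 0; i < len; i++ )
    {
        uint32_t intra = intra_costs[i];
        /* vbic #0xc000 strips the lists_used flags kept in the top two
         * bits of the lowres inter cost; vmin clamps it to the intra cost */
        uint32_t inter = inter_costs[i] & 0x3fff;
        if( inter > intra )
            inter = intra;
        float propagate_intra  = (float)(intra * inv_qscales[i]);   /* vmull.u16 */
        float propagate_num    = (float)(intra - inter);            /* vsubl.u16 */
        float propagate_denom  = (float)intra;                      /* vmovl.u16 */
        float propagate_amount = (float)propagate_in[i] + propagate_intra * fps; /* vmla.f32 */
        /* The NEON code replaces this division with vrecpe.f32 plus one
         * vrecps.f32 Newton-Raphson refinement step; vcvt.s32.f32 truncates */
        int v = intra ? (int)(propagate_amount * propagate_num / propagate_denom) : 0;
        dst[i] = (int16_t)CLAMP_S16( v );
    }
}

The inner math of the list kernel is a bilinear splat of each block's
propagate_amount over the four macroblocks its motion vector points into.
Here is a sketch of the per-block weight computation, again with
hypothetical names (propagate_list_weights; pos_x/pos_y stand for the
1/32-macroblock fractional position that the vuzp/vmovn/vbic sequence
recovers):

static void propagate_list_weights( uint16_t pos_x, uint16_t pos_y,
                                    uint16_t propagate_amount,
                                    int lists_used, int bipred_weight,
                                    uint16_t idxweight[4] )
{
    if( lists_used == 3 )   /* vbsl on the vceq #0xc000 mask */
        propagate_amount = (uint16_t)(( propagate_amount * bipred_weight + 32 ) >> 6);
    uint32_t x = pos_x & 31, y = pos_y & 31;
    /* vmull.u8 forms the four bilinear factors, vmull.u16 scales them by
     * propagate_amount, and vrshrn #10 is the rounding shift (+512 >> 10) */
    idxweight[0] = (uint16_t)(( (32 - y) * (32 - x) * propagate_amount + 512 ) >> 10);
    idxweight[1] = (uint16_t)(( (32 - y) *  x       * propagate_amount + 512 ) >> 10);
    idxweight[2] = (uint16_t)((  y       * (32 - x) * propagate_amount + 512 ) >> 10);
    idxweight[3] = (uint16_t)((  y       *  x       * propagate_amount + 512 ) >> 10);
}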