arm: implement x264_pixel_var_8x16_neon

author Janne Grunau <janne-x264@jannau.net>

Wed, 12 Mar 2014 12:16:00 +0000 (13:16 +0100)

committer Fiona Glaser <fiona@x264.com>

Wed, 12 Mar 2014 20:17:01 +0000 (13:17 -0700)
author Janne Grunau <janne-x264@jannau.net>
Wed, 12 Mar 2014 12:16:00 +0000 (13:16 +0100)
committer Fiona Glaser <fiona@x264.com>
Wed, 12 Mar 2014 20:17:01 +0000 (13:17 -0700)
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S

index 644e4497788215b749a28caf7cb688a19e0ce5bc..6bc904d86093105b154c70b6903adc6771d90412 100644 (file)
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -519,6 +519,38 @@ function x264_pixel_var_8x8_neon
      b               x264_var_end
  .endfunc
  
+function x264_pixel_var_8x16_neon          // NEON pixel_var for an 8x16 block: reads 16 rows of 8 bytes; declared as uint64_t f( uint8_t *, intptr_t )
+    vld1.64         {d16}, [r0,:64], r1    // row 0: load 8 bytes from r0 (64-bit aligned), then r0 += r1 (presumably the row stride)
+    vld1.64         {d18}, [r0,:64], r1    // row 1
+    vmull.u8        q1,  d16, d16          // q1 = row 0 squared, widened to 16-bit lanes
+    vmovl.u8        q0,  d16               // q0 = row 0 widened to 16-bit lanes: start of the running pixel sum
+    vld1.64         {d20}, [r0,:64], r1    // row 2
+    vmull.u8        q2,  d18, d18          // q2 = row 1 squared, widened to 16-bit lanes
+    vaddw.u8        q0,  q0,  d18          // q0 += row 1

+    mov             ip,  #12               // loop counter, decremented by 4 per pass: the loop below runs 3 times

+    vld1.64         {d22}, [r0,:64], r1    // row 3
+    VAR_SQR_SUM     q1,  q1,   q14,  d20, vpaddl.u16   // NOTE(review): macro defined outside this hunk; presumably q14 = row 2 squared, q0 += row 2, q1 = vpaddl.u16(q1) - confirm
+    vld1.64         {d16}, [r0,:64], r1    // row 4 (d16 is reused; row 0 has already been consumed above)
+    VAR_SQR_SUM     q2,  q2,   q15,  d22, vpaddl.u16   // same pattern for row 3, using q2/q15

+1:  subs            ip,  ip,  #4           // sets the flags tested by the beq below; only NEON loads and VAR_SQR_SUM sit in between
+    vld1.64         {d18}, [r0,:64], r1    // rows 5, 9, 13 on successive passes
+    VAR_SQR_SUM     q1,  q14,  q12, d16    // takes d16 (rows 4, 8, 12); presumably folds the pending squares in q14 into q1 - confirm against the macro
+    vld1.64         {d20}, [r0,:64], r1    // rows 6, 10, 14
+    VAR_SQR_SUM     q2,  q15,  q13, d18    // takes d18 (rows 5, 9, 13)
+    vld1.64         {d22}, [r0,:64], r1    // rows 7, 11, 15
+    VAR_SQR_SUM     q1,  q12,  q14, d20    // takes d20 (rows 6, 10, 14)
+    beq             2f                     // ip reached 0: all 16 rows are loaded, skip the extra load
+    vld1.64         {d16}, [r0,:64], r1    // preload rows 8, 12 for the next pass
+    VAR_SQR_SUM     q2,  q13,  q15, d22    // takes d22 (rows 7, 11)
+    b               1b                     // next pass
+2:                                         // exit path: same final step as the loop body, minus the preload
+    VAR_SQR_SUM     q2,  q13,  q15, d22    // takes d22 (row 15)
+    b               x264_var_end           // tail branch to shared code not shown here; presumably does the final reduction and returns - confirm
+.endfunc
+
  function x264_pixel_var_16x16_neon
      vld1.64         {d16-d17}, [r0,:128], r1
      vmull.u8        q12, d16, d16
diff --git a/common/arm/pixel.h b/common/arm/pixel.h

index 1024ee7839bb2c2cb589f44e54ad66af30846f8a..acd517b94242656e87bdefd4b734d99ed3312a5b 100644 (file)
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -56,6 +56,7 @@ int x264_pixel_sa8d_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t );
  int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
  
  uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
+uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
  uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
  int x264_pixel_var2_8x8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
  
diff --git a/common/pixel.c b/common/pixel.c

index 54ada1ea9a3bc8856c0d975e7707c22f2fb8515d..b760a1175a37c63c90f1d2560e54cfa8c493c5ea 100644 (file)
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1347,6 +1347,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
          pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
          pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
          pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
+        pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16_neon;
          pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
          pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
author	Janne Grunau <janne-x264@jannau.net>
	Wed, 12 Mar 2014 12:16:00 +0000 (13:16 +0100)
committer	Fiona Glaser <fiona@x264.com>
	Wed, 12 Mar 2014 20:17:01 +0000 (13:17 -0700)
common/arm/pixel-a.S		patch \| blob \| history
common/arm/pixel.h		patch \| blob \| history
common/pixel.c		patch \| blob \| history