Add i420->rv16 neon converter

author Sébastien Toque <xilasz@gmail.com>

Tue, 5 Mar 2013 20:20:39 +0000 (21:20 +0100)

committer Jean-Baptiste Kempf <jb@videolan.org>

Sun, 10 Mar 2013 16:22:20 +0000 (17:22 +0100)
author Sébastien Toque <xilasz@gmail.com>
Tue, 5 Mar 2013 20:20:39 +0000 (21:20 +0100)
committer Jean-Baptiste Kempf <jb@videolan.org>
Sun, 10 Mar 2013 16:22:20 +0000 (17:22 +0100)
diff --git a/modules/arm_neon/Modules.am b/modules/arm_neon/Modules.am

index 31064859d49575f5d5f2c85914ef6ca463c8e752..decb3b8189f0eb6b622c4d9f4b8aaf11fed3cf33 100644 (file)
--- a/modules/arm_neon/Modules.am
+++ b/modules/arm_neon/Modules.am
@@ -21,6 +21,7 @@ libvolume_neon_plugin_la_LIBADD = $(AM_LIBADD)
  
  libyuv_rgb_neon_plugin_la_SOURCES = \
         i420_rgb.S \
+       i420_rv16.S \
         nv21_rgb.S \
         nv12_rgb.S \
         yuv_rgb.c
diff --git a/modules/arm_neon/chroma_neon.h b/modules/arm_neon/chroma_neon.h

index 708d1218ee7e6db22ac84da8251f2f072a0cf82d..865315a7d1a0a261ab911b919fc3db3876b7b1a4 100644 (file)
--- a/modules/arm_neon/chroma_neon.h
+++ b/modules/arm_neon/chroma_neon.h
@@ -72,6 +72,11 @@ void i420_rgb_neon (struct yuv_pack *const out,
                      const struct yuv_planes *const in,
                      int width, int height) asm("i420_rgb_neon");
  
+/* I420 to RV16 (16-bit RGB565) conversion. */
+void i420_rv16_neon (struct yuv_pack *const out,
+                     const struct yuv_planes *const in,
+                     int width, int height) asm("i420_rv16_neon");
+
  /* NV21 to RGBA conversion. */
  void nv21_rgb_neon (struct yuv_pack *const out,
                      const struct yuv_planes *const in,
diff --git a/modules/arm_neon/i420_rv16.S b/modules/arm_neon/i420_rv16.S

new file mode 100644 (file)

index 0000000..cd6d269
--- /dev/null
+++ b/modules/arm_neon/i420_rv16.S
@@ -0,0 +1,227 @@
+ @*****************************************************************************
+ @ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2011 Sébastien Toque
+ @                    Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify it
+ @ under the terms of the GNU Lesser General Public License as published by
+ @ the Free Software Foundation; either version 2.1 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ @ GNU Lesser General Public License for more details.
+ @
+ @ You should have received a copy of the GNU Lesser General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+       .syntax unified
+       .fpu neon
+       .text
+
+/* ARM core registers */
+#define O1     r0      /* output write pointer, top row */
+#define O2     r1      /* output write pointer, bottom row (reuses r1 once the input struct is read) */
+#define WIDTH  r2      /* width argument, rounded up to a multiple of 16 on entry */
+#define HEIGHT r3      /* rows left to convert, decremented by 2 per row pair */
+#define Y1     r4      /* luma read pointer, top row */
+#define Y2     r5      /* luma read pointer, bottom row */
+#define U      r6
+#define V      r7
+#define YPITCH r8
+#define OPAD   r10     /* OPITCH - 2*WIDTH; also used as scratch during setup */
+#define YPAD   r11     /* YPITCH - WIDTH */
+#define COUNT  ip      /* pixels left in the current row pair */
+#define OPITCH lr
+
+/* NEON: 8-bit multipliers (scaled by 64) and 16-bit constant terms */
+#define coefY  D0
+#define coefRV D1
+#define coefGU D2
+#define coefGV D3
+#define coefBU D4
+#define Rc     Q3
+#define Gc     Q4
+#define Bc     Q5
+
+#define u      D24     /* same register as out1l; consumed before the pack step */
+#define v      D25     /* same register as red1 and out1h */
+#define y1     D18     /* even-numbered luma samples (vld2 de-interleaves) */
+#define y2     D19     /* odd-numbered luma samples */
+
+#define chro_r Q6      /* chroma terms, computed once and reused for both rows */
+#define chro_g Q7
+#define chro_b Q8
+#define lumi1  Q15     /* D30/D31: later overwritten by green2/blue2 */
+#define lumi2  Q10     /* same register as green16_1: use lumi2 before writing green16_1 */
+#define red16_1                Q9
+#define green16_1      Q10
+#define blue16_1       Q11
+#define red16_2                Q12
+#define green16_2      Q13
+#define blue16_2       Q14
+
+#define red1   D25     /* doubles as out1h: the high byte is built in place over red */
+#define green1 D26
+#define blue1  D27
+#define red2   D29     /* doubles as out2h */
+#define green2 D30
+#define blue2  D31
+
+#define out1l  D24     /* low output bytes, even pixels (before vzip) */
+#define out1h  D25     /* high output bytes, even pixels (before vzip) */
+#define out2l  D28     /* low output bytes, odd pixels (before vzip) */
+#define out2h  D29     /* high output bytes, odd pixels (before vzip) */
+
+coefficients:  /* read in order into Rc, Gc, Bc; presumably Y/chroma offsets plus rounding - confirm */
+    .short  -15872     /* Rc: equals -(16*74 + 128*115) + 32 */
+    .short    4992     /* Gc: equals -(16*74) + 128*(14 + 34) + 32 */
+    .short  -18432     /* Bc: equals -(16*74 + 128*135) + 32 */
+
+       .align 2
+       .global i420_rv16_neon
+       .type   i420_rv16_neon, %function
+i420_rv16_neon:        /* r0 -> struct yuv_pack (out), r1 -> struct yuv_planes (in), r2 = width, r3 = height */
+       push            {r4-r8,r10-r11,lr}
+       vpush           {q4-q7}         /* q4-q7 hold Gc, Bc, chro_r and chro_g below */
+
+       /* load arguments: r0 -> {output pointer, output pitch}, r1 -> {Y, U, V, Y pitch} */
+       ldmia           r0,     {O1, OPITCH}
+       ldmia           r1,     {Y1, U, V, YPITCH}
+
+       /* round the width up to a multiple of 16 (OPAD is only scratch here) */
+       ands            OPAD, WIDTH, #15
+       sub                     WIDTH, WIDTH, OPAD
+       addne           WIDTH, WIDTH, #16
+
+       /* init constants (scale value by 64) */
+       vmov.u8         coefY, #74
+       vmov.u8         coefRV, #115
+       vmov.u8         coefGU, #14
+       vmov.u8         coefGV, #34
+       vmov.u8         coefBU, #135
+       adr                     OPAD, coefficients      /* OPAD is scratch again; d6-d11 are Rc, Gc, Bc */
+       vld1.s16        {d6[], d7[]}, [OPAD]!
+       vld1.s16        {d8[], d9[]}, [OPAD]!
+       vld1.s16        {d10[], d11[]}, [OPAD]!
+
+       /* init padding; the cmp sets the flags tested on first entry to loop_row */
+       cmp                     HEIGHT, #0
+       sub                     OPAD,   OPITCH, WIDTH, lsl #1
+       sub                     YPAD,   YPITCH, WIDTH
+
+loop_row:
+       movsgt  COUNT,  WIDTH           /* if rows remain: COUNT = WIDTH, flags now reflect WIDTH */
+       add             O2,     O1,     OPITCH
+       add             Y2,     Y1,     YPITCH
+       /* exit if all rows have been processed (the movsgt above also makes a zero width exit here) */
+       vpople  {q4-q7}
+       pople   {r4-r8,r10-r11,pc}
+
+loop_col:
+
+       /* Common U & V: 8 chroma samples each, shared by 16 pixels in both rows */
+
+       vld1.u8 {u}, [U,:64]!
+       vld1.u8 {v}, [V,:64]!
+
+       /* Y Top Row: vld2 de-interleaves 16 luma bytes, y1 = even pixels, y2 = odd pixels */
+       vld2.u8 {y1,y2}, [Y1,:128]!
+
+       vmull.u8        Q14, v, coefRV
+       vmull.u8        Q11, u, coefGU
+       vmull.u8        Q13, u, coefBU
+       vmlal.u8        Q11, v, coefGV
+
+       vmull.u8        lumi2, y2, coefY
+       vmull.u8        lumi1, y1, coefY
+       vadd.s16        chro_r, Rc, Q14         /* chro_r = Rc + v*coefRV */
+       vadd.s16        chro_b, Bc, Q13         /* chro_b = Bc + u*coefBU */
+       vsub.s16        chro_g, Gc, Q11         /* chro_g = Gc - (u*coefGU + v*coefGV) */
+
+       pld     [U]
+       pld     [V]
+
+       /* chrominance + luminance (saturating); green16_1 shares Q10 with lumi2, so lumi2 is used first */
+       vqadd.s16       red16_2, lumi2, chro_r
+       vqadd.s16       green16_2, lumi2, chro_g
+       vqadd.s16       blue16_2, lumi2, chro_b
+       vqadd.s16       red16_1, lumi1, chro_r
+       vqadd.s16       green16_1, lumi1, chro_g
+       vqadd.s16       blue16_1, lumi1, chro_b
+
+       /* clamp (divide by 64); each destination overlaps a 16-bit sum narrowed earlier, so keep this order */
+       vqrshrun.s16    green2, green16_2, #6
+       vqrshrun.s16    blue2, blue16_2, #6
+       vqrshrun.s16    red2, red16_2, #6
+       vqrshrun.s16    green1, green16_1, #6
+       vqrshrun.s16    red1, red16_1, #6
+       vqrshrun.s16    blue1, blue16_1, #6
+
+       pld     [Y1]
+
+       /* pack into RGB565: high byte = R[7:3] G[7:5] (built over red), low byte = G[4:2] B[7:3] */
+       vshl.u8 out2l, green2, #3 // low 2a
+       vsri.u8 out2h, green2, #5 // high 2
+       vshl.u8 out1l, green1, #3 // low 1a
+       vsri.u8 out1h, green1, #5 // high 1
+       vsri.u8 out2l, blue2, #3 // low 2b
+       vsri.u8 out1l, blue1, #3 // low 1b
+
+       /* Y Bottom Row */
+       vld2.u8 {y1,y2}, [Y2,:128]!
+
+       /* Top Row output: vzip re-interleaves even/odd pixels; bottom-row luma multiplies are mixed in */
+       vzip.u8 out1h, out2h
+       vmull.u8        lumi2, y2, coefY
+       vzip.u8 out1l, out2l
+       vmull.u8        lumi1, y1, coefY
+       vst2.u8 {out1l, out1h}, [O1,:128]!      /* pixels 0-7, low byte then high byte */
+       vst2.u8 {out2l, out2h}, [O1,:128]!      /* pixels 8-15 */
+
+       /* chrominance + luminance: same chroma terms as the top row; lumi2 is again used before green16_1 */
+       vqadd.s16       green16_2, lumi2, chro_g
+       vqadd.s16       red16_2, lumi2, chro_r
+       vqadd.s16       blue16_2, lumi2, chro_b
+       vqadd.s16       red16_1, lumi1, chro_r
+       vqadd.s16       green16_1, lumi1, chro_g
+       vqadd.s16       blue16_1, lumi1, chro_b
+
+       /* clamp (divide by 64); same ordering constraint as for the top row */
+       vqrshrun.s16    green2, green16_2, #6
+       vqrshrun.s16    blue2, blue16_2, #6
+       vqrshrun.s16    red2, red16_2, #6
+       vqrshrun.s16    green1, green16_1, #6
+       vqrshrun.s16    red1, red16_1, #6
+       vqrshrun.s16    blue1, blue16_1, #6
+
+       pld     [Y2]    /* prefetch the upcoming bottom-row luma */
+
+       /* pack into RGB565 (same layout as the top row) */
+       vshl.u8 out2l, green2, #3 // low 2a
+       vsri.u8 out2h, green2, #5 // high 2
+       vshl.u8 out1l, green1, #3 // low 1a
+       vsri.u8 out1h, green1, #5 // high 1
+       vsri.u8 out2l, blue2, #3 // low 2b
+       vsri.u8 out1l, blue1, #3 // low 1b
+
+       vzip.u8 out1h, out2h
+       vzip.u8 out1l, out2l
+       vst2.u8 {out1l, out1h}, [O2,:128]!
+       vst2.u8 {out2l, out2h}, [O2,:128]!
+
+       /* next columns (x16) */
+       subs    COUNT,  COUNT,  #16
+       bgt             loop_col
+
+       /* next rows (x2); the adds below leave the subs flags intact for loop_row */
+       subs    HEIGHT, #2
+       add             O1,     O2,     OPAD
+       add             Y1,     Y2,     YPAD
+       add             U,      U,      YPAD,   lsr #1  /* chroma rows skip half the luma padding */
+       add             V,      V,      YPAD,   lsr #1
+       b               loop_row
diff --git a/modules/arm_neon/yuv_rgb.c b/modules/arm_neon/yuv_rgb.c

index 0fb29a2081cc9b0048fef69eaeb49c8faeddf0f1..d28a27ef1e5ceb44bab6257188bc286ba7e836af 100644 (file)
--- a/modules/arm_neon/yuv_rgb.c
+++ b/modules/arm_neon/yuv_rgb.c
@@ -95,6 +95,14 @@ static void I420_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
      struct yuv_planes in = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS, src->Y_PITCH };
      i420_rgb_neon (&out, &in, filter->fmt_in.video.i_width, filter->fmt_in.video.i_height);
  }
+
+static void I420_RV16 (filter_t *filter, picture_t *src, picture_t *dst)
+{   /* Convert src into dst by calling the NEON routine i420_rv16_neon(). */
+    struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch }; /* destination pixels and pitch */
+    struct yuv_planes in = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS, src->Y_PITCH }; /* Y, U, V pointers and Y pitch */
+    i420_rv16_neon (&out, &in, filter->fmt_in.video.i_width, filter->fmt_in.video.i_height); /* size taken from the input format */
+}
+
  static void YV12_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
  {
      struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch };
@@ -117,6 +125,7 @@ static void NV12_RGBA (filter_t *filter, picture_t *src, picture_t *dst)
  }
  
  VIDEO_FILTER_WRAPPER (I420_RGBA)
+VIDEO_FILTER_WRAPPER (I420_RV16)
  VIDEO_FILTER_WRAPPER (YV12_RGBA)
  VIDEO_FILTER_WRAPPER (NV21_RGBA)
  VIDEO_FILTER_WRAPPER (NV12_RGBA)
@@ -135,6 +144,17 @@ static int Open (vlc_object_t *obj)
  
      switch (filter->fmt_out.video.i_chroma)
      {
+        case VLC_CODEC_RGB16:
+            switch (filter->fmt_in.video.i_chroma)
+            {
+                case VLC_CODEC_I420:
+                    filter->pf_video_filter = I420_RV16_Filter;
+                    break;
+                default:
+                    return VLC_EGENERIC;
+            }
+            break;
+
          case VLC_CODEC_RGB32:
              if(        filter->fmt_out.video.i_rmask != 0x000000ff
                      || filter->fmt_out.video.i_gmask != 0x0000ff00
author	Sébastien Toque <xilasz@gmail.com>
	Tue, 5 Mar 2013 20:20:39 +0000 (21:20 +0100)
committer	Jean-Baptiste Kempf <jb@videolan.org>
	Sun, 10 Mar 2013 16:22:20 +0000 (17:22 +0100)
modules/arm_neon/Modules.am		patch \| blob \| history
modules/arm_neon/chroma_neon.h		patch \| blob \| history
modules/arm_neon/i420_rv16.S	[new file with mode: 0644]	patch \| blob
modules/arm_neon/yuv_rgb.c		patch \| blob \| history