]> git.sesse.net Git - vlc/blobdiff - modules/arm_neon/i420_yuy2.S
Move ARM NEON optimizations to arm_neon/
[vlc] / modules / arm_neon / i420_yuy2.S
diff --git a/modules/arm_neon/i420_yuy2.S b/modules/arm_neon/i420_yuy2.S
new file mode 100644 (file)
index 0000000..995cf62
--- /dev/null
@@ -0,0 +1,108 @@
+ @*****************************************************************************
+ @ i420_yuy2.S : ARM NEONv1 I420 to YUYV and UYVY chroma conversion
+ @*****************************************************************************
+ @ Copyright (C) 2009 Rémi Denis-Courmont
+ @
+ @ This program is free software; you can redistribute it and/or modify
+ @ it under the terms of the GNU General Public License as published by
+ @ the Free Software Foundation; either version 2 of the License, or
+ @ (at your option) any later version.
+ @
+ @ This program is distributed in the hope that it will be useful,
+ @ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ @ GNU General Public License for more details.
+ @
+ @ You should have received a copy of the GNU General Public License
+ @ along with this program; if not, write to the Free Software Foundation,
+ @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ @****************************************************************************/
+
+       .fpu neon
+       .text
+
+ @ Register roles shared by both conversion routines in this file.
+ @ Keep comments off the define lines themselves: trailing text there would
+ @ become part of the macro expansion.
+ @   O1      first output row pointer (r0 on entry)
+ @   O2      second output row pointer; r1 arrives holding the address of
+ @           the source pointer table and is reused once that table is read
+ @   PITCH   bytes per luma row, also half the size of one output row
+ @   HEIGHT  rows left to convert, decreased by two per outer pass
+ @   Y1, Y2  luma pointers for the first and second row of the pair
+ @   U, V    chroma pointers, advanced only by post-increment
+ @   END_O1  value of O1 at which the inner loop stops
+#define O1     r0
+#define        O2      r1
+#define        PITCH   r2
+#define        HEIGHT  r3
+#define        Y1      r4
+#define        Y2      r5
+#define        U       r6
+#define        V       r7
+#define        END_O1  r12
+
+ @-----------------------------------------------------------------------------
+ @ i420_yuyv_neon: pack planar Y, U, V bytes into Y U Y V order, handling two
+ @ picture rows per outer pass (both rows share one row of chroma).
+ @ In:    r0 = output pointer (O1); stores use 128-bit alignment hints
+ @        r1 = address of three consecutive words holding the Y, U and V
+ @             source pointers (read by the ldmia below)
+ @        r2 = PITCH: bytes consumed per luma row; one output row is
+ @             2 * PITCH bytes.  Must be a non-zero multiple of 16, since the
+ @             inner loop moves 16 luma bytes per pass and stops only on
+ @             exact pointer equality.
+ @        r3 = HEIGHT: row count, compared as a signed value.  Rows are
+ @             handled in pairs, so an odd count converts one extra row.
+ @             Zero or negative returns without touching memory.
+ @ Clobb: r0, r1, r3, r12, q0-q3, flags (r4-r7 are saved and restored)
+ @ NOTE(review): the C prototype is not visible here - confirm the caller
+ @ never relies on heights above 0x7fffffff.
+ @-----------------------------------------------------------------------------
+       .align
+       .global i420_yuyv_neon
+       .type   i420_yuyv_neon, %function
+i420_yuyv_neon:
+       cmp             HEIGHT, #0
+       ble             3f                      @ nothing to convert
+       push            {r4-r7, lr}
+       ldmia           r1,     {Y1, U, V}      @ r1 is free for reuse as O2 afterwards
+       add             O2,     O1,     PITCH, lsl #1   @ second output row
+       add             Y2,     Y1,     PITCH           @ second luma row
+1:
+       mov             END_O1, O2              @ first output row ends where the second starts
+       pld             [Y2]
+2:
+       pld             [U, #64]
+       vld1.u8         {d2},           [U,:64]!
+       pld             [V, #64]
+       vld1.u8         {d3},           [V,:64]!
+       pld             [Y1, #64]
+       vzip.u8         d2,     d3              @ q1 = U0 V0 U1 V1 ...
+       vld1.u8         {q0},           [Y1,:128]!
+       pld             [Y2, #64]
+       vmov            q3,     q1              @ second row reuses the same chroma
+       vzip.u8         q0,     q1              @ q0:q1 = Y U Y V ... (first row)
+       vld1.u8         {q2},           [Y2,:128]!
+       vzip.u8         q2,     q3              @ q2:q3 = Y U Y V ... (second row)
+       vst1.u8         {q0-q1},        [O1,:128]!
+       vst1.u8         {q2-q3},        [O2,:128]!
+
+       cmp             O1,     END_O1
+       bne             2b
+
+       @ Step to the next row pair.  O2 and Y2 have each advanced by one full
+       @ row, so they now sit at the start of the next pair.
+       mov             O1,     O2
+       add             O2,     PITCH,  lsl #1
+       mov             Y1,     Y2
+       add             Y2,     PITCH
+
+       @ Signed test right after the flag-setting subtract: the loop also
+       @ stops when the count steps past zero instead of landing on it.
+       subs            HEIGHT, #2
+       bgt             1b
+
+       pop             {r4-r7, pc}
+3:
+       bx              lr
+
+ @-----------------------------------------------------------------------------
+ @ i420_uyvy_neon: pack planar Y, U, V bytes into U Y V Y order, handling two
+ @ picture rows per outer pass (both rows share one row of chroma).
+ @ In:    r0 = output pointer (O1); stores use 128-bit alignment hints
+ @        r1 = address of three consecutive words holding the Y, U and V
+ @             source pointers (read by the ldmia below)
+ @        r2 = PITCH: bytes consumed per luma row; one output row is
+ @             2 * PITCH bytes.  Must be a non-zero multiple of 16, since the
+ @             inner loop moves 16 luma bytes per pass and stops only on
+ @             exact pointer equality.
+ @        r3 = HEIGHT: row count, compared as a signed value.  Rows are
+ @             handled in pairs, so an odd count converts one extra row.
+ @             Zero or negative returns without touching memory.
+ @ Clobb: r0, r1, r3, r12, q0-q3, flags (r4-r7 are saved and restored)
+ @ NOTE(review): the C prototype is not visible here - confirm the caller
+ @ never relies on heights above 0x7fffffff.
+ @-----------------------------------------------------------------------------
+       .global i420_uyvy_neon
+       .type   i420_uyvy_neon, %function
+i420_uyvy_neon:
+       cmp             HEIGHT, #0
+       ble             3f                      @ nothing to convert
+       push            {r4-r7, lr}
+       ldmia           r1,     {Y1, U, V}      @ r1 is free for reuse as O2 afterwards
+       add             O2,     O1,     PITCH, lsl #1   @ second output row
+       add             Y2,     Y1,     PITCH           @ second luma row
+1:
+       mov             END_O1, O2              @ first output row ends where the second starts
+2:
+       vld1.u8         {d0},           [U,:64]!
+       vld1.u8         {d1},           [V,:64]!
+       vzip.u8         d0,     d1              @ q0 = U0 V0 U1 V1 ...
+       vld1.u8         {q1},           [Y1,:128]!
+       vmov            q2,     q0              @ second row reuses the same chroma
+       vzip.u8         q0,     q1              @ q0:q1 = U Y V Y ... (first row)
+       vld1.u8         {q3},           [Y2,:128]!
+       vzip.u8         q2,     q3              @ q2:q3 = U Y V Y ... (second row)
+       vst1.u8         {q0-q1},        [O1,:128]!
+       vst1.u8         {q2-q3},        [O2,:128]!
+
+       cmp             O1,     END_O1
+       bne             2b
+
+       @ Step to the next row pair.  O2 and Y2 have each advanced by one full
+       @ row, so they now sit at the start of the next pair.
+       mov             O1,     O2
+       add             O2,     PITCH,  lsl #1
+       mov             Y1,     Y2
+       add             Y2,     PITCH
+
+       @ Signed test right after the flag-setting subtract: the loop also
+       @ stops when the count steps past zero instead of landing on it.
+       subs            HEIGHT, #2
+       bgt             1b
+
+       pop             {r4-r7, pc}
+3:
+       bx              lr