1 /*****************************************************************************
2 * asm.S: AArch64 utility macros
3 *****************************************************************************
4 * Copyright (C) 2008-2015 x264 project
6 * Authors: Mans Rullgard <mans@mansr.com>
7 * David Conrad <lessen42@gmail.com>
8 * Janne Grunau <janne-x264@jannau.net>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
// function name, export=0, align=2
//   Emits symbol bookkeeping directives for an assembly routine.
//   name   - symbol name of the routine
//   export - defaults to 0; no visible line tests it, so how it selects
//            between the EXTERN_ASM-prefixed and the plain directives
//            cannot be read from this copy
//   align  - defaults to 2; not referenced by any line visible here
// ELF and FUNC are line prefixes defined outside this view; presumably they
// expand to nothing or to a comment marker depending on the target - confirm.
// NOTE(review): the listing numbers embedded in these lines jump
// (54, 56, 63-65, 68), so parts of this macro are missing from this copy
// (no label, conditional or .endm lines appear). Restore them from the
// upstream file before relying on this text.
54 .macro function name, export=0, align=2
// Symbol size = distance from the name label to the current location.
56 ELF .size \name, . - \name
// Directives for the global symbol carrying the EXTERN_ASM prefix.
63 .global EXTERN_ASM\name
64 ELF .type EXTERN_ASM\name, %function
65 FUNC .func EXTERN_ASM\name
// Same type annotation for the symbol without the prefix.
68 ELF .type \name, %function
// const name, align=2
//   Emits bookkeeping directives for a named data object.
//   name  - symbol name of the object
//   align - defaults to 2; not referenced by any line visible here
// ELF is a line prefix defined outside this view - presumably active only
// on ELF targets; confirm.
// NOTE(review): the embedded listing numbers jump from 74 to 76 and the next
// visible line is 86, so the section, alignment, label and .endm lines of
// this macro are missing from this copy.
74 .macro const name, align=2
// Symbol size = distance from the name label to the current location.
76 ELF .size \name, . - \name
// Position-independent address formation: add the low 12 bits (offset
// within the page) of symbol val to rd.
// NOTE(review): the enclosing .macro line, the instruction that puts the
// page address into rd (presumably adrp), the #elif/#else/#endif lines and
// the .endm are not visible in this copy - confirm against upstream.
86 #if defined(PIC) && defined(__APPLE__)
// Apple assemblers spell the page-offset operator as a @PAGEOFF suffix.
88 add \rd, \rd, \val@PAGEOFF
// Other form: the :lo12: relocation operator as a prefix.
91 add \rd, \rd, :lo12:\val
// Token-pasting helpers. GLUE pastes its two arguments; JOIN goes through
// GLUE so that its arguments are macro-expanded before being pasted.
97 #define GLUE(a, b) a ## b
98 #define JOIN(a, b) GLUE(a, b)
// X(s): the symbol s with the EXTERN_ASM prefix applied (EXTERN_ASM is
// defined outside this view).
99 #define X(s) JOIN(EXTERN_ASM, s)
// Stride constants. NOTE(review): units and the buffers they describe are
// not shown here - presumably bytes per row of the decoded (FDEC) and
// encoded (FENC) block buffers; verify against the C headers.
101 #define FDEC_STRIDE 32
102 #define FENC_STRIDE 16
// SUMSUB_AB sum, sub, a, b
// NOTE(review): only the header is present in this copy; the body and .endm
// are missing. Judging by the parameter names it presumably writes a+b to
// sum and a-b to sub - confirm against the upstream file.
105 .macro SUMSUB_AB sum, sub, a, b
// unzip t1, t2, s1, s2
// NOTE(review): only the header is present in this copy; the body and .endm
// are missing. Presumably de-interleaves s1/s2 into t1 (even elements) and
// t2 (odd elements) - confirm against the upstream file.
110 .macro unzip t1, t2, s1, s2
// transpose t1, t2, s1, s2
// Helper invoked by transpose4x4.h and transpose4x8.h with operands that
// already carry an arrangement suffix (for example .2s or .4h).
// NOTE(review): only the header is present in this copy; the body and .endm
// are missing. Presumably t1 = trn1(s1, s2) and t2 = trn2(s1, s2) - confirm
// against the upstream file.
115 .macro transpose t1, t2, s1, s2
// transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
//   Two-pass shuffle of four 64-bit vectors of 16-bit elements.
//   v0-v3 : the four rows (in/out), given as bare register names; the macro
//           appends the arrangement suffix itself
//   t0-t3 : scratch vector registers, overwritten
//   Assuming the transpose helper emits a trn1/trn2 pair (its body is not
//   visible here - confirm), register i ends up holding column i of the
//   4x4 input.
120 .macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
// Pass 1: exchange 32-bit lanes between rows (0,2) and rows (1,3).
121 transpose \t0\().2s, \t2\().2s, \v0\().2s, \v2\().2s
122 transpose \t1\().2s, \t3\().2s, \v1\().2s, \v3\().2s
// Pass 2: exchange 16-bit lanes between the temporaries, writing the rows
// back in place.
123 transpose \v0\().4h, \v1\().4h, \t0\().4h, \t1\().4h
124 transpose \v2\().4h, \v3\().4h, \t2\().4h, \t3\().4h
// transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
//   Same two-pass shuffle as transpose4x4.h, but on full 128-bit vectors
//   (eight 16-bit elements per row).
//   v0-v3 : the four rows (in/out), given as bare register names
//   t0-t3 : scratch vector registers, overwritten
//   Assuming the transpose helper emits a trn1/trn2 pair (its body is not
//   visible here - confirm), each row is treated as two 4-element halves and
//   the two resulting 4x4 tiles are transposed independently: register i
//   ends up holding column i of the left tile followed by column i of the
//   right tile.
127 .macro transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
// Pass 1: exchange 32-bit lanes between rows (0,2) and rows (1,3).
128 transpose \t0\().4s, \t2\().4s, \v0\().4s, \v2\().4s
129 transpose \t1\().4s, \t3\().4s, \v1\().4s, \v3\().4s
// Pass 2: exchange 16-bit lanes between the temporaries, writing the rows
// back in place.
130 transpose \v0\().8h, \v1\().8h, \t0\().8h, \t1\().8h
131 transpose \v2\().8h, \v3\().8h, \t2\().8h, \t3\().8h
// transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
//   In-register transpose of an 8x8 matrix of 16-bit elements.
//   r0-r7 : the eight rows, one 128-bit vector register each (in/out),
//           given as bare register names; the macro appends the
//           arrangement suffix itself. On exit register i holds column i
//           of the input.
//   r8,r9 : scratch vector registers, overwritten
//   trn1 gathers the even-numbered lanes of its two sources, trn2 the
//   odd-numbered lanes. Registers are reused as soon as their old contents
//   are dead, so the instruction order below must not be changed.
// NOTE(review): arrangement suffixes are upper case here while the other
// macros in this file use lower case.
135 .macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
// Pass 1 (16-bit lanes): row pairs (0,1) (2,3) (4,5) (6,7).
// Results land in r8/r9, r1/r3, r0/r5 and r2/r7.
136 trn1 \r8\().8H, \r0\().8H, \r1\().8H
137 trn2 \r9\().8H, \r0\().8H, \r1\().8H
138 trn1 \r1\().8H, \r2\().8H, \r3\().8H
139 trn2 \r3\().8H, \r2\().8H, \r3\().8H
140 trn1 \r0\().8H, \r4\().8H, \r5\().8H
141 trn2 \r5\().8H, \r4\().8H, \r5\().8H
142 trn1 \r2\().8H, \r6\().8H, \r7\().8H
143 trn2 \r7\().8H, \r6\().8H, \r7\().8H
// Pass 2 (32-bit lanes): combine the pass-1 results two by two, so every
// register now carries 4-element column fragments.
145 trn1 \r4\().4S, \r0\().4S, \r2\().4S
146 trn2 \r2\().4S, \r0\().4S, \r2\().4S
147 trn1 \r6\().4S, \r5\().4S, \r7\().4S
148 trn2 \r7\().4S, \r5\().4S, \r7\().4S
149 trn1 \r5\().4S, \r9\().4S, \r3\().4S
150 trn2 \r9\().4S, \r9\().4S, \r3\().4S
151 trn1 \r3\().4S, \r8\().4S, \r1\().4S
152 trn2 \r8\().4S, \r8\().4S, \r1\().4S
// Pass 3 (64-bit lanes): join the top-half and bottom-half fragments.
// Columns 0 and 4.
154 trn1 \r0\().2D, \r3\().2D, \r4\().2D
155 trn2 \r4\().2D, \r3\().2D, \r4\().2D
// Columns 1 and 5.
157 trn1 \r1\().2D, \r5\().2D, \r6\().2D
158 trn2 \r5\().2D, \r5\().2D, \r6\().2D
// Columns 6 and 2. trn2 goes first because the trn1 below overwrites r2,
// which both instructions read.
160 trn2 \r6\().2D, \r8\().2D, \r2\().2D
161 trn1 \r2\().2D, \r8\().2D, \r2\().2D
// Columns 3 and 7.
163 trn1 \r3\().2D, \r9\().2D, \r7\().2D
164 trn2 \r7\().2D, \r9\().2D, \r7\().2D
// transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
//   In-register transpose of eight rows of 16 bytes, handled as two
//   independent 8x8 byte tiles (byte columns 0-7 and 8-15).
//   r0-r7 : the eight rows, one 128-bit vector register each (in/out),
//           given as bare register names. On exit the low 8 bytes of
//           register i hold byte column i of rows 0-7 and the high 8 bytes
//           hold byte column i+8.
//   t0,t1 : scratch vector registers, overwritten
//   trn1 gathers the even-numbered lanes of its two sources, trn2 the
//   odd-numbered lanes. Registers are reused as soon as their old contents
//   are dead, so the instruction order below must not be changed.
167 .macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
// Pass 1 (8-bit lanes): row pairs (0,1) (2,3) (4,5) (6,7).
// Results land in t0/t1, r1/r3, r0/r5 and r2/r7.
168 trn1 \t0\().16b, \r0\().16b, \r1\().16b
169 trn2 \t1\().16b, \r0\().16b, \r1\().16b
170 trn1 \r1\().16b, \r2\().16b, \r3\().16b
171 trn2 \r3\().16b, \r2\().16b, \r3\().16b
172 trn1 \r0\().16b, \r4\().16b, \r5\().16b
173 trn2 \r5\().16b, \r4\().16b, \r5\().16b
174 trn1 \r2\().16b, \r6\().16b, \r7\().16b
175 trn2 \r7\().16b, \r6\().16b, \r7\().16b
// Pass 2 (16-bit lanes): combine the pass-1 results two by two, so every
// register now carries 4-byte column fragments.
177 trn1 \r4\().8h, \r0\().8h, \r2\().8h
178 trn2 \r2\().8h, \r0\().8h, \r2\().8h
179 trn1 \r6\().8h, \r5\().8h, \r7\().8h
180 trn2 \r7\().8h, \r5\().8h, \r7\().8h
181 trn1 \r5\().8h, \t1\().8h, \r3\().8h
182 trn2 \t1\().8h, \t1\().8h, \r3\().8h
183 trn1 \r3\().8h, \t0\().8h, \r1\().8h
184 trn2 \t0\().8h, \t0\().8h, \r1\().8h
// Pass 3 (32-bit lanes): join the rows 0-3 and rows 4-7 fragments.
// Columns 0 and 4 (plus 8 and 12 in the high halves).
186 trn1 \r0\().4s, \r3\().4s, \r4\().4s
187 trn2 \r4\().4s, \r3\().4s, \r4\().4s
// Columns 1 and 5.
189 trn1 \r1\().4s, \r5\().4s, \r6\().4s
190 trn2 \r5\().4s, \r5\().4s, \r6\().4s
// Columns 6 and 2. trn2 goes first because the trn1 below overwrites r2,
// which both instructions read.
192 trn2 \r6\().4s, \t0\().4s, \r2\().4s
193 trn1 \r2\().4s, \t0\().4s, \r2\().4s
// Columns 3 and 7.
195 trn1 \r3\().4s, \t1\().4s, \r7\().4s
196 trn2 \r7\().4s, \t1\().4s, \r7\().4s
// transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
//   Transposes four 4x4 byte tiles at once. r0-r3 are four rows of 16
//   bytes; every group of four consecutive byte columns forms one tile.
//   r0-r3 : the four rows, one 128-bit vector register each (in/out),
//           given as bare register names. On exit register i holds, tile
//           by tile in the original left-to-right order, column i of each
//           tile.
//   t4-t7 : scratch vector registers, overwritten
199 .macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
// Pass 1 (8-bit lanes): rows (0,1) into t4/t5, rows (2,3) into t6/t7;
// trn1 takes the even byte columns, trn2 the odd ones.
200 trn1 \t4\().16b, \r0\().16b, \r1\().16b
201 trn2 \t5\().16b, \r0\().16b, \r1\().16b
202 trn1 \t6\().16b, \r2\().16b, \r3\().16b
203 trn2 \t7\().16b, \r2\().16b, \r3\().16b
// Pass 2 (16-bit lanes): merge the two row pairs. The even-column
// temporaries yield columns 0 and 2, the odd-column ones columns 1 and 3.
205 trn1 \r0\().8h, \t4\().8h, \t6\().8h
206 trn2 \r2\().8h, \t4\().8h, \t6\().8h
207 trn1 \r1\().8h, \t5\().8h, \t7\().8h
208 trn2 \r3\().8h, \t5\().8h, \t7\().8h
// transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
//   64-bit variant of the 4-row byte transpose: r0-r3 are four rows of 8
//   bytes, handled as two independent 4x4 byte tiles (byte columns 0-3 and
//   4-7).
//   r0-r3 : the four rows, one 64-bit vector each (in/out), given as bare
//           register names. On exit register i holds column i of the left
//           tile followed by column i of the right tile.
//   t4-t7 : scratch vector registers, overwritten
211 .macro transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
// Pass 1 (8-bit lanes): rows (0,1) into t4/t5, rows (2,3) into t6/t7;
// trn1 takes the even byte columns, trn2 the odd ones.
212 trn1 \t4\().8b, \r0\().8b, \r1\().8b
213 trn2 \t5\().8b, \r0\().8b, \r1\().8b
214 trn1 \t6\().8b, \r2\().8b, \r3\().8b
215 trn2 \t7\().8b, \r2\().8b, \r3\().8b
// Pass 2 (16-bit lanes): merge the two row pairs. The even-column
// temporaries yield columns 0 and 2, the odd-column ones columns 1 and 3.
217 trn1 \r0\().4h, \t4\().4h, \t6\().4h
218 trn2 \r2\().4h, \t4\().4h, \t6\().4h
219 trn1 \r1\().4h, \t5\().4h, \t7\().4h
220 trn2 \r3\().4h, \t5\().4h, \t7\().4h